@myrialabs/clopen 0.0.4 → 0.0.6

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (453) hide show
  1. package/.env.example +5 -5
  2. package/.github/workflows/{release.yml → ci.yml} +86 -60
  3. package/CONTRIBUTING.md +499 -499
  4. package/LICENSE +21 -21
  5. package/README.md +209 -209
  6. package/backend/index.ts +168 -156
  7. package/backend/lib/chat/helpers.ts +42 -42
  8. package/backend/lib/chat/index.ts +1 -1
  9. package/backend/lib/chat/stream-manager.ts +1126 -1126
  10. package/backend/lib/database/README.md +76 -76
  11. package/backend/lib/database/index.ts +118 -118
  12. package/backend/lib/database/migrations/001_create_projects_table.ts +30 -30
  13. package/backend/lib/database/migrations/002_create_chat_sessions_table.ts +32 -32
  14. package/backend/lib/database/migrations/003_create_messages_table.ts +31 -31
  15. package/backend/lib/database/migrations/004_create_prompt_templates_table.ts +34 -34
  16. package/backend/lib/database/migrations/005_create_settings_table.ts +23 -23
  17. package/backend/lib/database/migrations/006_add_user_to_messages.ts +57 -57
  18. package/backend/lib/database/migrations/007_create_stream_states_table.ts +40 -40
  19. package/backend/lib/database/migrations/008_create_message_snapshots_table.ts +61 -61
  20. package/backend/lib/database/migrations/009_add_delta_snapshot_fields.ts +41 -41
  21. package/backend/lib/database/migrations/010_add_soft_delete_and_branch_support.ts +70 -70
  22. package/backend/lib/database/migrations/011_git_like_commit_graph.ts +156 -156
  23. package/backend/lib/database/migrations/012_add_file_change_statistics.ts +41 -41
  24. package/backend/lib/database/migrations/013_checkpoint_tree_state.ts +118 -118
  25. package/backend/lib/database/migrations/014_add_engine_to_sessions.ts +18 -18
  26. package/backend/lib/database/migrations/015_add_model_to_sessions.ts +18 -18
  27. package/backend/lib/database/migrations/016_create_user_projects_table.ts +34 -34
  28. package/backend/lib/database/migrations/017_add_current_session_to_user_projects.ts +32 -32
  29. package/backend/lib/database/migrations/018_create_claude_accounts_table.ts +24 -24
  30. package/backend/lib/database/migrations/019_add_claude_account_to_sessions.ts +18 -18
  31. package/backend/lib/database/migrations/020_add_snapshot_tree_hash.ts +32 -32
  32. package/backend/lib/database/migrations/021_drop_prompt_templates_table.ts +33 -33
  33. package/backend/lib/database/migrations/index.ts +153 -153
  34. package/backend/lib/database/queries/checkpoint-queries.ts +87 -87
  35. package/backend/lib/database/queries/engine-queries.ts +75 -75
  36. package/backend/lib/database/queries/index.ts +8 -8
  37. package/backend/lib/database/queries/message-queries.ts +471 -471
  38. package/backend/lib/database/queries/project-queries.ts +117 -117
  39. package/backend/lib/database/queries/session-queries.ts +270 -270
  40. package/backend/lib/database/queries/settings-queries.ts +33 -33
  41. package/backend/lib/database/queries/snapshot-queries.ts +325 -325
  42. package/backend/lib/database/queries/utils-queries.ts +58 -58
  43. package/backend/lib/database/seeders/index.ts +12 -12
  44. package/backend/lib/database/seeders/settings_seeder.ts +83 -83
  45. package/backend/lib/database/utils/connection.ts +173 -173
  46. package/backend/lib/database/utils/index.ts +3 -3
  47. package/backend/lib/database/utils/migration-runner.ts +117 -117
  48. package/backend/lib/database/utils/seeder-runner.ts +120 -120
  49. package/backend/lib/engine/adapters/claude/environment.ts +160 -164
  50. package/backend/lib/engine/adapters/claude/error-handler.ts +60 -60
  51. package/backend/lib/engine/adapters/claude/index.ts +1 -1
  52. package/backend/lib/engine/adapters/claude/path-utils.ts +38 -38
  53. package/backend/lib/engine/adapters/claude/stream.ts +177 -177
  54. package/backend/lib/engine/adapters/opencode/index.ts +2 -2
  55. package/backend/lib/engine/adapters/opencode/message-converter.ts +862 -862
  56. package/backend/lib/engine/adapters/opencode/server.ts +104 -104
  57. package/backend/lib/engine/adapters/opencode/stream.ts +755 -755
  58. package/backend/lib/engine/index.ts +196 -196
  59. package/backend/lib/engine/types.ts +58 -58
  60. package/backend/lib/files/file-operations.ts +478 -478
  61. package/backend/lib/files/file-reading.ts +308 -308
  62. package/backend/lib/files/file-watcher.ts +383 -383
  63. package/backend/lib/files/path-browsing.ts +382 -382
  64. package/backend/lib/git/git-executor.ts +89 -88
  65. package/backend/lib/git/git-parser.ts +411 -411
  66. package/backend/lib/git/git-service.ts +505 -505
  67. package/backend/lib/mcp/README.md +1144 -1144
  68. package/backend/lib/mcp/config.ts +317 -316
  69. package/backend/lib/mcp/index.ts +35 -35
  70. package/backend/lib/mcp/project-context.ts +236 -236
  71. package/backend/lib/mcp/servers/browser-automation/actions.ts +156 -156
  72. package/backend/lib/mcp/servers/browser-automation/browser.ts +419 -419
  73. package/backend/lib/mcp/servers/browser-automation/index.ts +791 -791
  74. package/backend/lib/mcp/servers/browser-automation/inspection.ts +501 -501
  75. package/backend/lib/mcp/servers/helper.ts +143 -143
  76. package/backend/lib/mcp/servers/index.ts +44 -44
  77. package/backend/lib/mcp/servers/weather/get-temperature.ts +56 -56
  78. package/backend/lib/mcp/servers/weather/index.ts +31 -31
  79. package/backend/lib/mcp/stdio-server.ts +103 -103
  80. package/backend/lib/mcp/types.ts +65 -65
  81. package/backend/lib/preview/browser/browser-audio-capture.ts +86 -86
  82. package/backend/lib/preview/browser/browser-console-manager.ts +262 -262
  83. package/backend/lib/preview/browser/browser-dialog-handler.ts +222 -222
  84. package/backend/lib/preview/browser/browser-interaction-handler.ts +421 -421
  85. package/backend/lib/preview/browser/browser-mcp-control.ts +415 -415
  86. package/backend/lib/preview/browser/browser-native-ui-handler.ts +512 -512
  87. package/backend/lib/preview/browser/browser-navigation-tracker.ts +103 -103
  88. package/backend/lib/preview/browser/browser-pool.ts +357 -357
  89. package/backend/lib/preview/browser/browser-preview-service.ts +882 -882
  90. package/backend/lib/preview/browser/browser-tab-manager.ts +935 -935
  91. package/backend/lib/preview/browser/browser-video-capture.ts +695 -695
  92. package/backend/lib/preview/browser/scripts/audio-stream.ts +292 -292
  93. package/backend/lib/preview/browser/scripts/cursor-tracking.ts +85 -85
  94. package/backend/lib/preview/browser/scripts/video-stream.ts +438 -438
  95. package/backend/lib/preview/browser/types.ts +359 -359
  96. package/backend/lib/preview/index.ts +23 -23
  97. package/backend/lib/project/index.ts +1 -1
  98. package/backend/lib/project/status-manager.ts +181 -181
  99. package/backend/lib/shared/env.ts +117 -0
  100. package/backend/lib/shared/index.ts +5 -2
  101. package/backend/lib/shared/port-utils.ts +25 -25
  102. package/backend/lib/shared/process-manager.ts +280 -280
  103. package/backend/lib/snapshot/blob-store.ts +227 -227
  104. package/backend/lib/snapshot/gitignore.ts +307 -307
  105. package/backend/lib/snapshot/helpers.ts +397 -397
  106. package/backend/lib/snapshot/snapshot-service.ts +483 -483
  107. package/backend/lib/terminal/helpers.ts +14 -14
  108. package/backend/lib/terminal/index.ts +7 -7
  109. package/backend/lib/terminal/pty-manager.ts +3 -3
  110. package/backend/lib/terminal/pty-session-manager.ts +370 -387
  111. package/backend/lib/terminal/shell-utils.ts +315 -312
  112. package/backend/lib/terminal/stream-manager.ts +292 -292
  113. package/backend/lib/tunnel/global-tunnel-manager.ts +266 -243
  114. package/backend/lib/tunnel/project-tunnel-manager.ts +311 -311
  115. package/backend/lib/user/helpers.ts +87 -87
  116. package/backend/lib/utils/ws.ts +944 -944
  117. package/backend/lib/vite-dev.ts +295 -295
  118. package/backend/middleware/cors.ts +16 -15
  119. package/backend/middleware/error-handler.ts +50 -49
  120. package/backend/middleware/logger.ts +9 -9
  121. package/backend/types/api.ts +24 -24
  122. package/backend/ws/README.md +1505 -1505
  123. package/backend/ws/chat/background.ts +198 -198
  124. package/backend/ws/chat/index.ts +21 -21
  125. package/backend/ws/chat/stream.ts +707 -707
  126. package/backend/ws/engine/claude/accounts.ts +399 -401
  127. package/backend/ws/engine/claude/index.ts +13 -13
  128. package/backend/ws/engine/claude/status.ts +43 -43
  129. package/backend/ws/engine/index.ts +14 -14
  130. package/backend/ws/engine/opencode/index.ts +11 -11
  131. package/backend/ws/engine/opencode/status.ts +30 -30
  132. package/backend/ws/engine/utils.ts +36 -36
  133. package/backend/ws/files/index.ts +30 -30
  134. package/backend/ws/files/read.ts +189 -189
  135. package/backend/ws/files/search.ts +453 -453
  136. package/backend/ws/files/watch.ts +124 -124
  137. package/backend/ws/files/write.ts +143 -143
  138. package/backend/ws/git/branch.ts +106 -106
  139. package/backend/ws/git/commit.ts +39 -39
  140. package/backend/ws/git/conflict.ts +68 -68
  141. package/backend/ws/git/diff.ts +69 -69
  142. package/backend/ws/git/index.ts +24 -24
  143. package/backend/ws/git/log.ts +41 -41
  144. package/backend/ws/git/remote.ts +214 -214
  145. package/backend/ws/git/staging.ts +84 -84
  146. package/backend/ws/git/status.ts +90 -90
  147. package/backend/ws/index.ts +69 -69
  148. package/backend/ws/mcp/index.ts +61 -61
  149. package/backend/ws/messages/crud.ts +74 -74
  150. package/backend/ws/messages/index.ts +14 -14
  151. package/backend/ws/preview/browser/cleanup.ts +129 -129
  152. package/backend/ws/preview/browser/console.ts +114 -114
  153. package/backend/ws/preview/browser/interact.ts +513 -513
  154. package/backend/ws/preview/browser/mcp.ts +129 -129
  155. package/backend/ws/preview/browser/native-ui.ts +235 -235
  156. package/backend/ws/preview/browser/stats.ts +55 -55
  157. package/backend/ws/preview/browser/tab-info.ts +126 -126
  158. package/backend/ws/preview/browser/tab.ts +166 -166
  159. package/backend/ws/preview/browser/webcodecs.ts +293 -293
  160. package/backend/ws/preview/index.ts +146 -146
  161. package/backend/ws/projects/crud.ts +113 -113
  162. package/backend/ws/projects/index.ts +25 -25
  163. package/backend/ws/projects/presence.ts +46 -46
  164. package/backend/ws/projects/status.ts +116 -116
  165. package/backend/ws/sessions/crud.ts +327 -327
  166. package/backend/ws/sessions/index.ts +33 -33
  167. package/backend/ws/settings/crud.ts +112 -112
  168. package/backend/ws/settings/index.ts +14 -14
  169. package/backend/ws/snapshot/index.ts +17 -17
  170. package/backend/ws/snapshot/restore.ts +173 -173
  171. package/backend/ws/snapshot/timeline.ts +141 -141
  172. package/backend/ws/system/index.ts +14 -14
  173. package/backend/ws/system/operations.ts +49 -49
  174. package/backend/ws/terminal/index.ts +40 -40
  175. package/backend/ws/terminal/persistence.ts +153 -153
  176. package/backend/ws/terminal/session.ts +382 -382
  177. package/backend/ws/terminal/stream.ts +79 -79
  178. package/backend/ws/tunnel/index.ts +14 -14
  179. package/backend/ws/tunnel/operations.ts +91 -91
  180. package/backend/ws/types.ts +20 -20
  181. package/backend/ws/user/crud.ts +156 -156
  182. package/backend/ws/user/index.ts +14 -14
  183. package/bin/clopen.ts +307 -307
  184. package/bun.lock +1353 -1352
  185. package/frontend/App.svelte +38 -34
  186. package/frontend/app.css +313 -313
  187. package/frontend/lib/app-environment.ts +10 -10
  188. package/frontend/lib/components/chat/ChatInterface.svelte +406 -406
  189. package/frontend/lib/components/chat/formatters/ErrorMessage.svelte +56 -56
  190. package/frontend/lib/components/chat/formatters/MessageFormatter.svelte +223 -223
  191. package/frontend/lib/components/chat/formatters/TextMessage.svelte +394 -394
  192. package/frontend/lib/components/chat/formatters/Tools.svelte +69 -69
  193. package/frontend/lib/components/chat/formatters/index.ts +2 -2
  194. package/frontend/lib/components/chat/input/ChatInput.svelte +421 -421
  195. package/frontend/lib/components/chat/input/components/ChatInputActions.svelte +78 -78
  196. package/frontend/lib/components/chat/input/components/DragDropOverlay.svelte +30 -30
  197. package/frontend/lib/components/chat/input/components/EditModeIndicator.svelte +33 -33
  198. package/frontend/lib/components/chat/input/components/EngineModelPicker.svelte +619 -619
  199. package/frontend/lib/components/chat/input/components/FileAttachmentPreview.svelte +48 -48
  200. package/frontend/lib/components/chat/input/components/LoadingIndicator.svelte +31 -31
  201. package/frontend/lib/components/chat/input/composables/use-animations.svelte.ts +201 -201
  202. package/frontend/lib/components/chat/input/composables/use-chat-actions.svelte.ts +148 -148
  203. package/frontend/lib/components/chat/input/composables/use-file-handling.svelte.ts +216 -216
  204. package/frontend/lib/components/chat/input/composables/use-input-state.svelte.ts +357 -357
  205. package/frontend/lib/components/chat/input/composables/use-textarea-resize.svelte.ts +57 -57
  206. package/frontend/lib/components/chat/message/ChatMessage.svelte +478 -478
  207. package/frontend/lib/components/chat/message/ChatMessages.svelte +541 -541
  208. package/frontend/lib/components/chat/message/DateSeparator.svelte +86 -86
  209. package/frontend/lib/components/chat/message/MessageBubble.svelte +86 -86
  210. package/frontend/lib/components/chat/message/MessageHeader.svelte +157 -157
  211. package/frontend/lib/components/chat/modal/DebugModal.svelte +59 -59
  212. package/frontend/lib/components/chat/modal/TokenUsageModal.svelte +124 -124
  213. package/frontend/lib/components/chat/shared/index.ts +1 -1
  214. package/frontend/lib/components/chat/shared/utils.ts +115 -115
  215. package/frontend/lib/components/chat/tools/BashOutputTool.svelte +35 -35
  216. package/frontend/lib/components/chat/tools/BashTool.svelte +45 -45
  217. package/frontend/lib/components/chat/tools/CustomMcpTool.svelte +139 -139
  218. package/frontend/lib/components/chat/tools/EditTool.svelte +47 -47
  219. package/frontend/lib/components/chat/tools/ExitPlanModeTool.svelte +31 -31
  220. package/frontend/lib/components/chat/tools/GlobTool.svelte +50 -50
  221. package/frontend/lib/components/chat/tools/GrepTool.svelte +89 -89
  222. package/frontend/lib/components/chat/tools/KillShellTool.svelte +25 -25
  223. package/frontend/lib/components/chat/tools/ListMcpResourcesTool.svelte +30 -30
  224. package/frontend/lib/components/chat/tools/NotebookEditTool.svelte +37 -37
  225. package/frontend/lib/components/chat/tools/ReadMcpResourceTool.svelte +33 -33
  226. package/frontend/lib/components/chat/tools/ReadTool.svelte +40 -40
  227. package/frontend/lib/components/chat/tools/TaskTool.svelte +63 -63
  228. package/frontend/lib/components/chat/tools/TodoWriteTool.svelte +74 -74
  229. package/frontend/lib/components/chat/tools/WebFetchTool.svelte +34 -34
  230. package/frontend/lib/components/chat/tools/WebSearchTool.svelte +83 -83
  231. package/frontend/lib/components/chat/tools/WriteTool.svelte +32 -32
  232. package/frontend/lib/components/chat/tools/components/CodeBlock.svelte +78 -78
  233. package/frontend/lib/components/chat/tools/components/DiffBlock.svelte +407 -407
  234. package/frontend/lib/components/chat/tools/components/FileHeader.svelte +45 -45
  235. package/frontend/lib/components/chat/tools/components/InfoLine.svelte +18 -18
  236. package/frontend/lib/components/chat/tools/components/StatsBadges.svelte +26 -26
  237. package/frontend/lib/components/chat/tools/components/TerminalCommand.svelte +53 -53
  238. package/frontend/lib/components/chat/tools/components/index.ts +7 -7
  239. package/frontend/lib/components/chat/tools/index.ts +25 -25
  240. package/frontend/lib/components/chat/widgets/FloatingTodoList.svelte +248 -248
  241. package/frontend/lib/components/chat/widgets/TokenUsage.svelte +78 -78
  242. package/frontend/lib/components/checkpoint/TimelineModal.svelte +391 -391
  243. package/frontend/lib/components/checkpoint/timeline/TimelineEdge.svelte +26 -26
  244. package/frontend/lib/components/checkpoint/timeline/TimelineGraph.svelte +86 -86
  245. package/frontend/lib/components/checkpoint/timeline/TimelineNode.svelte +108 -108
  246. package/frontend/lib/components/checkpoint/timeline/TimelineVersionGroup.svelte +59 -59
  247. package/frontend/lib/components/checkpoint/timeline/animation.ts +168 -168
  248. package/frontend/lib/components/checkpoint/timeline/config.ts +44 -44
  249. package/frontend/lib/components/checkpoint/timeline/graph-builder.ts +304 -304
  250. package/frontend/lib/components/checkpoint/timeline/types.ts +65 -65
  251. package/frontend/lib/components/checkpoint/timeline/utils.ts +53 -53
  252. package/frontend/lib/components/common/Alert.svelte +138 -138
  253. package/frontend/lib/components/common/AvatarBubble.svelte +55 -55
  254. package/frontend/lib/components/common/Button.svelte +71 -71
  255. package/frontend/lib/components/common/Card.svelte +102 -102
  256. package/frontend/lib/components/common/Checkbox.svelte +48 -48
  257. package/frontend/lib/components/common/Dialog.svelte +248 -248
  258. package/frontend/lib/components/common/FolderBrowser.svelte +842 -842
  259. package/frontend/lib/components/common/Icon.svelte +57 -57
  260. package/frontend/lib/components/common/Input.svelte +72 -72
  261. package/frontend/lib/components/common/Lightbox.svelte +232 -232
  262. package/frontend/lib/components/common/LoadingScreen.svelte +52 -52
  263. package/frontend/lib/components/common/LoadingSpinner.svelte +48 -48
  264. package/frontend/lib/components/common/Modal.svelte +177 -177
  265. package/frontend/lib/components/common/ModalProvider.svelte +27 -27
  266. package/frontend/lib/components/common/ModelSelector.svelte +110 -110
  267. package/frontend/lib/components/common/MonacoEditor.svelte +568 -568
  268. package/frontend/lib/components/common/NotificationToast.svelte +113 -113
  269. package/frontend/lib/components/common/PageTemplate.svelte +75 -75
  270. package/frontend/lib/components/common/ProjectUserAvatars.svelte +79 -79
  271. package/frontend/lib/components/common/Select.svelte +97 -97
  272. package/frontend/lib/components/common/Textarea.svelte +79 -79
  273. package/frontend/lib/components/common/ThemeToggle.svelte +44 -44
  274. package/frontend/lib/components/common/lucide-icons.ts +1642 -1642
  275. package/frontend/lib/components/common/material-icons.ts +1082 -1082
  276. package/frontend/lib/components/common/xterm/XTerm.svelte +809 -795
  277. package/frontend/lib/components/common/xterm/index.ts +15 -15
  278. package/frontend/lib/components/common/xterm/terminal-config.ts +67 -67
  279. package/frontend/lib/components/common/xterm/types.ts +30 -30
  280. package/frontend/lib/components/common/xterm/xterm-service.ts +379 -353
  281. package/frontend/lib/components/files/FileNode.svelte +383 -383
  282. package/frontend/lib/components/files/FileTree.svelte +681 -681
  283. package/frontend/lib/components/files/FileViewer.svelte +728 -728
  284. package/frontend/lib/components/files/SearchResults.svelte +303 -303
  285. package/frontend/lib/components/git/BranchManager.svelte +458 -458
  286. package/frontend/lib/components/git/ChangesSection.svelte +107 -107
  287. package/frontend/lib/components/git/CommitForm.svelte +76 -76
  288. package/frontend/lib/components/git/ConflictResolver.svelte +158 -158
  289. package/frontend/lib/components/git/DiffViewer.svelte +364 -364
  290. package/frontend/lib/components/git/FileChangeItem.svelte +97 -97
  291. package/frontend/lib/components/git/GitButton.svelte +33 -33
  292. package/frontend/lib/components/git/GitLog.svelte +361 -361
  293. package/frontend/lib/components/git/GitModal.svelte +80 -80
  294. package/frontend/lib/components/history/HistoryModal.svelte +563 -563
  295. package/frontend/lib/components/history/HistoryView.svelte +614 -614
  296. package/frontend/lib/components/index.ts +34 -34
  297. package/frontend/lib/components/preview/browser/BrowserPreview.svelte +549 -549
  298. package/frontend/lib/components/preview/browser/components/Canvas.svelte +1058 -1058
  299. package/frontend/lib/components/preview/browser/components/ConsolePanel.svelte +756 -756
  300. package/frontend/lib/components/preview/browser/components/Container.svelte +450 -450
  301. package/frontend/lib/components/preview/browser/components/ContextMenu.svelte +236 -236
  302. package/frontend/lib/components/preview/browser/components/SelectDropdown.svelte +224 -224
  303. package/frontend/lib/components/preview/browser/components/Toolbar.svelte +338 -338
  304. package/frontend/lib/components/preview/browser/components/VirtualCursor.svelte +35 -35
  305. package/frontend/lib/components/preview/browser/core/cleanup.svelte.ts +155 -155
  306. package/frontend/lib/components/preview/browser/core/coordinator.svelte.ts +837 -837
  307. package/frontend/lib/components/preview/browser/core/interactions.svelte.ts +113 -113
  308. package/frontend/lib/components/preview/browser/core/mcp-handlers.svelte.ts +296 -296
  309. package/frontend/lib/components/preview/browser/core/native-ui-handlers.svelte.ts +391 -391
  310. package/frontend/lib/components/preview/browser/core/stream-handler.svelte.ts +231 -231
  311. package/frontend/lib/components/preview/browser/core/tab-manager.svelte.ts +210 -210
  312. package/frontend/lib/components/preview/browser/core/tab-operations.svelte.ts +239 -239
  313. package/frontend/lib/components/preview/index.ts +1 -1
  314. package/frontend/lib/components/settings/SettingsModal.svelte +235 -235
  315. package/frontend/lib/components/settings/SettingsView.svelte +36 -36
  316. package/frontend/lib/components/settings/appearance/AppearanceSettings.svelte +51 -51
  317. package/frontend/lib/components/settings/appearance/LayoutPresetSettings.svelte +160 -160
  318. package/frontend/lib/components/settings/appearance/LayoutPreview.svelte +76 -76
  319. package/frontend/lib/components/settings/engines/AIEnginesSettings.svelte +917 -917
  320. package/frontend/lib/components/settings/general/AdvancedSettings.svelte +187 -187
  321. package/frontend/lib/components/settings/general/DataManagementSettings.svelte +203 -203
  322. package/frontend/lib/components/settings/general/GeneralSettings.svelte +10 -10
  323. package/frontend/lib/components/settings/model/ModelSettings.svelte +357 -357
  324. package/frontend/lib/components/settings/notifications/NotificationSettings.svelte +205 -205
  325. package/frontend/lib/components/settings/user/UserSettings.svelte +197 -197
  326. package/frontend/lib/components/terminal/Terminal.svelte +367 -367
  327. package/frontend/lib/components/terminal/TerminalTabs.svelte +87 -87
  328. package/frontend/lib/components/terminal/TerminalView.svelte +54 -54
  329. package/frontend/lib/components/tunnel/TunnelActive.svelte +157 -142
  330. package/frontend/lib/components/tunnel/TunnelButton.svelte +60 -54
  331. package/frontend/lib/components/tunnel/TunnelInactive.svelte +285 -284
  332. package/frontend/lib/components/tunnel/TunnelModal.svelte +48 -47
  333. package/frontend/lib/components/tunnel/TunnelQRCode.svelte +49 -49
  334. package/frontend/lib/components/workspace/DesktopNavigator.svelte +382 -382
  335. package/frontend/lib/components/workspace/MobileNavigator.svelte +394 -403
  336. package/frontend/lib/components/workspace/PanelContainer.svelte +100 -100
  337. package/frontend/lib/components/workspace/PanelHeader.svelte +505 -505
  338. package/frontend/lib/components/workspace/ViewMenu.svelte +162 -162
  339. package/frontend/lib/components/workspace/WorkspaceLayout.svelte +169 -169
  340. package/frontend/lib/components/workspace/layout/DesktopLayout.svelte +15 -15
  341. package/frontend/lib/components/workspace/layout/MobileLayout.svelte +17 -17
  342. package/frontend/lib/components/workspace/layout/split-pane/Container.svelte +42 -42
  343. package/frontend/lib/components/workspace/layout/split-pane/Handle.svelte +84 -84
  344. package/frontend/lib/components/workspace/layout/split-pane/Layout.svelte +37 -37
  345. package/frontend/lib/components/workspace/panels/ChatPanel.svelte +274 -274
  346. package/frontend/lib/components/workspace/panels/FilesPanel.svelte +1261 -1261
  347. package/frontend/lib/components/workspace/panels/GitPanel.svelte +1560 -1560
  348. package/frontend/lib/components/workspace/panels/PreviewPanel.svelte +150 -150
  349. package/frontend/lib/components/workspace/panels/TerminalPanel.svelte +73 -73
  350. package/frontend/lib/constants/preview.ts +44 -44
  351. package/frontend/lib/services/chat/chat.service.ts +704 -704
  352. package/frontend/lib/services/chat/index.ts +6 -6
  353. package/frontend/lib/services/notification/global-stream-monitor.ts +86 -86
  354. package/frontend/lib/services/notification/index.ts +7 -7
  355. package/frontend/lib/services/notification/push.service.ts +143 -143
  356. package/frontend/lib/services/notification/sound.service.ts +126 -126
  357. package/frontend/lib/services/preview/browser/browser-console.service.ts +61 -61
  358. package/frontend/lib/services/preview/browser/browser-webcodecs.service.ts +1499 -1499
  359. package/frontend/lib/services/preview/browser/mcp-integration.svelte.ts +67 -67
  360. package/frontend/lib/services/preview/index.ts +22 -22
  361. package/frontend/lib/services/project/index.ts +7 -7
  362. package/frontend/lib/services/project/status.service.ts +159 -159
  363. package/frontend/lib/services/snapshot/snapshot.service.ts +47 -47
  364. package/frontend/lib/services/terminal/background/index.ts +129 -129
  365. package/frontend/lib/services/terminal/background/session-restore.ts +273 -273
  366. package/frontend/lib/services/terminal/background/stream-manager.ts +285 -285
  367. package/frontend/lib/services/terminal/index.ts +13 -13
  368. package/frontend/lib/services/terminal/persistence.service.ts +260 -260
  369. package/frontend/lib/services/terminal/project.service.ts +952 -952
  370. package/frontend/lib/services/terminal/session.service.ts +363 -363
  371. package/frontend/lib/services/terminal/terminal.service.ts +369 -369
  372. package/frontend/lib/stores/core/app.svelte.ts +117 -117
  373. package/frontend/lib/stores/core/files.svelte.ts +72 -72
  374. package/frontend/lib/stores/core/presence.svelte.ts +48 -48
  375. package/frontend/lib/stores/core/projects.svelte.ts +317 -317
  376. package/frontend/lib/stores/core/sessions.svelte.ts +383 -383
  377. package/frontend/lib/stores/features/claude-accounts.svelte.ts +58 -58
  378. package/frontend/lib/stores/features/models.svelte.ts +89 -89
  379. package/frontend/lib/stores/features/settings.svelte.ts +87 -87
  380. package/frontend/lib/stores/features/terminal.svelte.ts +700 -700
  381. package/frontend/lib/stores/features/tunnel.svelte.ts +163 -161
  382. package/frontend/lib/stores/features/user.svelte.ts +95 -95
  383. package/frontend/lib/stores/ui/chat-input.svelte.ts +56 -56
  384. package/frontend/lib/stores/ui/chat-model.svelte.ts +61 -61
  385. package/frontend/lib/stores/ui/dialog.svelte.ts +58 -58
  386. package/frontend/lib/stores/ui/edit-mode.svelte.ts +214 -214
  387. package/frontend/lib/stores/ui/notification.svelte.ts +166 -166
  388. package/frontend/lib/stores/ui/settings-modal.svelte.ts +88 -88
  389. package/frontend/lib/stores/ui/theme.svelte.ts +179 -179
  390. package/frontend/lib/stores/ui/workspace.svelte.ts +754 -754
  391. package/frontend/lib/types/native-ui.ts +73 -73
  392. package/frontend/lib/utils/chat/date-separator.ts +38 -38
  393. package/frontend/lib/utils/chat/message-grouper.ts +218 -218
  394. package/frontend/lib/utils/chat/message-processor.ts +134 -134
  395. package/frontend/lib/utils/chat/tool-handler.ts +160 -160
  396. package/frontend/lib/utils/chat/virtual-scroll.svelte.ts +142 -142
  397. package/frontend/lib/utils/click-outside.ts +20 -20
  398. package/frontend/lib/utils/context-manager.ts +256 -256
  399. package/frontend/lib/utils/file-icon-mappings.ts +768 -768
  400. package/frontend/lib/utils/folder-icon-mappings.ts +1029 -1029
  401. package/frontend/lib/utils/git-status.ts +68 -68
  402. package/frontend/lib/utils/platform.ts +112 -112
  403. package/frontend/lib/utils/port-check.ts +64 -64
  404. package/frontend/lib/utils/terminalFormatter.ts +206 -206
  405. package/frontend/lib/utils/theme.ts +6 -6
  406. package/frontend/lib/utils/tree-visualizer.ts +320 -320
  407. package/frontend/lib/utils/ws.ts +44 -44
  408. package/frontend/main.ts +13 -13
  409. package/index.html +70 -70
  410. package/package.json +111 -111
  411. package/scripts/generate-icons.ts +86 -86
  412. package/scripts/pre-publish-check.sh +142 -142
  413. package/scripts/setup-hooks.sh +134 -134
  414. package/scripts/validate-branch-name.sh +47 -47
  415. package/scripts/validate-commit-msg.sh +42 -42
  416. package/shared/constants/engines.ts +134 -134
  417. package/shared/types/database/connection.ts +15 -15
  418. package/shared/types/database/index.ts +5 -5
  419. package/shared/types/database/schema.ts +140 -140
  420. package/shared/types/engine/index.ts +45 -45
  421. package/shared/types/filesystem/index.ts +21 -21
  422. package/shared/types/git.ts +171 -171
  423. package/shared/types/messaging/index.ts +238 -238
  424. package/shared/types/messaging/tool.ts +525 -525
  425. package/shared/types/network/api.ts +17 -17
  426. package/shared/types/network/index.ts +4 -4
  427. package/shared/types/stores/app.ts +22 -22
  428. package/shared/types/stores/dialog.ts +20 -20
  429. package/shared/types/stores/index.ts +2 -2
  430. package/shared/types/stores/settings.ts +15 -15
  431. package/shared/types/terminal/index.ts +43 -43
  432. package/shared/types/ui/components.ts +60 -60
  433. package/shared/types/ui/icons.ts +22 -22
  434. package/shared/types/ui/index.ts +21 -21
  435. package/shared/types/ui/notifications.ts +13 -13
  436. package/shared/types/ui/theme.ts +11 -11
  437. package/shared/types/websocket/index.ts +43 -43
  438. package/shared/types/window.d.ts +12 -12
  439. package/shared/utils/anonymous-user.ts +167 -167
  440. package/shared/utils/async.ts +10 -10
  441. package/shared/utils/diff-calculator.ts +184 -184
  442. package/shared/utils/file-type-detection.ts +165 -165
  443. package/shared/utils/logger.ts +144 -158
  444. package/shared/utils/message-formatter.ts +79 -79
  445. package/shared/utils/path.ts +47 -47
  446. package/shared/utils/ws-client.ts +768 -768
  447. package/shared/utils/ws-server.ts +660 -660
  448. package/static/favicon.svg +7 -7
  449. package/static/fonts/dm-sans.css +96 -96
  450. package/svelte.config.js +20 -20
  451. package/tsconfig.json +41 -41
  452. package/vite.config.ts +33 -33
  453. package/.github/workflows/test.yml +0 -40
@@ -1,791 +1,791 @@
1
- /**
2
- * Browser Automation - Custom MCP Server
3
- *
4
- * Provides comprehensive browser automation tools for AI-driven testing and exploration.
5
- */
6
-
7
- import { z } from "zod";
8
- import { defineServer } from "../helper";
9
-
10
- // Import handlers
11
- import { listTabsHandler, switchTabHandler, openNewTabHandler, closeTabHandler, navigateHandler, setViewportHandler } from "./browser";
12
- import { actionsHandler } from "./actions";
13
- import { getConsoleLogsHandler, clearConsoleLogsHandler, executeConsoleHandler, takeScreenshotHandler, analyzeDomHandler } from "./inspection";
14
-
15
- export default defineServer({
16
- name: "browser-automation",
17
- version: "1.0.0",
18
- tools: {
19
- // ============================================================================
20
- // Browser Tab Management
21
- // ============================================================================
22
- "list_tabs": {
23
- description: `List all open browser tabs with metadata.
24
-
25
- WHEN TO USE:
26
- - Start of session to see available tabs
27
- - Before switching tabs
28
- - Verify tab state after open/close operations
29
-
30
- OUTPUT: Formatted text with tab index, ID, title, URL, and active indicator (*). Example:
31
- [1] tab-abc123 * Google Search - https://google.com
32
- [2] tab-def456 Calculator - https://calculator.net
33
-
34
- Use exact tab ID from output with switch_tab or close_tab - not the index number.`,
35
- handler: listTabsHandler
36
- },
37
-
38
- "switch_tab": {
39
- description: `Switch browser focus to different tab.
40
-
41
- WHEN TO USE:
42
- - Multi-tab workflows
43
- - Comparing content across tabs
44
- - Managing multiple sessions
45
-
46
- OUTPUT: Confirmation with switched tab's ID, title, and URL.
47
-
48
- Tab ID must be obtained from list_tabs. After switching, previous tab's context is lost - take new screenshot or analyze_dom if needed.`,
49
- schema: {
50
- tabId: z.string().min(1).describe("Tab ID obtained from list_tabs output (e.g., 'tab-abc123')")
51
- },
52
- handler: switchTabHandler
53
- },
54
-
55
- "open_new_tab": {
56
- description: `Create new browser tab with optional URL and viewport configuration.
57
-
58
- WHEN TO USE:
59
- - Starting fresh session
60
- - Opening multiple pages for comparison
61
- - Separating workflows
62
- - Testing risky actions in isolation
63
- - Testing responsive designs across devices
64
-
65
- OUTPUT: Tab ID, title, and URL of newly created tab. New tab automatically becomes active.
66
-
67
- Automatically creates session. New tab is immediately active - no need to switch_tab. If URL provided, waits for page load.
68
-
69
- VIEWPORT MODES:
70
- - desktop: 1920x1080 (default rotation: landscape) - For testing full desktop layouts
71
- - laptop: 1280x800 (default, default rotation: landscape) - Most common desktop viewport
72
- - tablet: 820x1050 (default rotation: portrait) - iPad-like tablet viewport
73
- - mobile: 393x740 (default rotation: portrait) - Modern smartphone viewport
74
-
75
- ROTATION:
76
- - portrait: Standard vertical orientation (default for tablet/mobile)
77
- - landscape: Horizontal orientation (default for desktop/laptop)
78
- - Omit rotation to use device-appropriate default`,
79
- schema: {
80
- url: z.string().url().optional().describe("Initial URL with protocol (http:// or https://). Omit for blank tab."),
81
- deviceSize: z.enum(['desktop', 'laptop', 'tablet', 'mobile']).optional().default('laptop').describe("Viewport device size (default: laptop). Choose based on testing needs."),
82
- rotation: z.enum(['portrait', 'landscape']).optional().describe("Screen orientation. If omitted, uses device-appropriate default: landscape for desktop/laptop, portrait for tablet/mobile.")
83
- },
84
- handler: openNewTabHandler
85
- },
86
-
87
- "close_tab": {
88
- description: `Close tab by ID and cleanup session.
89
-
90
- WHEN TO USE:
91
- - Cleanup after completing task
92
- - Managing memory/resources
93
- - Closing tabs no longer needed
94
-
95
- OUTPUT: Confirmation message. If closed tab was active, automatically switches to another tab and returns new active tab info.
96
-
97
- Cannot close last remaining tab (will error). If tab was active, focus moves to another tab - check list_tabs after.`,
98
- schema: {
99
- tabId: z.string().min(1).describe("Tab ID obtained from list_tabs output (e.g., 'tab-abc123')")
100
- },
101
- handler: closeTabHandler
102
- },
103
-
104
- "set_viewport": {
105
- description: `Change viewport settings (device size and rotation) for active tab.
106
-
107
- WHEN TO USE:
108
- - Testing responsive designs across devices
109
- - Switching between mobile and desktop views
110
- - Testing different screen orientations
111
- - Simulating different device types
112
-
113
- OUTPUT: Confirmation with tab ID and new viewport settings.
114
-
115
- VIEWPORT MODES:
116
- - desktop: 1920x1080 - For testing full desktop layouts
117
- - laptop: 1280x800 - Most common desktop viewport
118
- - tablet: 820x1050 - iPad-like tablet viewport
119
- - mobile: 393x740 - Modern smartphone viewport
120
-
121
- ROTATION:
122
- - portrait: Standard vertical orientation
123
- - landscape: Horizontal orientation (width and height swapped)
124
-
125
- The viewport change is applied immediately to the active tab. If testing multiple viewports, consider taking screenshots before and after to compare layouts.`,
126
- schema: {
127
- deviceSize: z.enum(['desktop', 'laptop', 'tablet', 'mobile']).optional().describe("Viewport device size. Omit to keep current device size."),
128
- rotation: z.enum(['portrait', 'landscape']).optional().describe("Screen orientation. Omit to keep current rotation.")
129
- },
130
- handler: setViewportHandler
131
- },
132
-
133
- // ============================================================================
134
- // Navigation
135
- // ============================================================================
136
- "navigate": {
137
- description: `Navigate to URL and wait for page load.
138
-
139
- PRIMARY USE: Following links when you have href URL (more efficient than clicking coordinates).
140
-
141
- WHEN TO USE:
142
- - Following links from analyze_dom.navigation.links (use href directly)
143
- - Moving to known URLs
144
- - Multi-page information extraction workflow
145
- - Refreshing with new parameters
146
-
147
- WHEN TO AVOID:
148
- - Don't have URL (must use take_screenshot + click instead)
149
- - Testing click interactions (use actions for actual click testing)
150
-
151
- EFFICIENT WORKFLOW:
152
-
153
- Information extraction across pages:
154
- → navigate(page1)
155
- → analyze_dom(['navigation', 'content'])
156
- → Find link: {text: "Pricing", href: "https://example.com/pricing"}
157
- → navigate(href) ← USE THIS, not click coordinates!
158
- → analyze_dom(['content'])
159
- → Extract information
160
-
161
- Why navigate > click for links:
162
- - Faster: direct URL navigation
163
- - More reliable: no coordinate dependency
164
- - Cleaner: no need for screenshot
165
- - Better for information tasks
166
-
167
- Example - Multi-page extraction:
168
- Task: "Find pricing at loom.com"
169
- → navigate("https://www.loom.com")
170
- → analyze_dom(['navigation'])
171
- → Found: {text: "Pricing", href: "https://www.loom.com/pricing"}
172
- → navigate("https://www.loom.com/pricing") ← Efficient!
173
- → analyze_dom(['content'])
174
- → Extract pricing info
175
-
176
- OUTPUT: Final URL after all redirects complete.
177
-
178
- After navigate, page is completely new:
179
- - Previous analyze_dom data is invalid - re-run on new page
180
- - Pattern for information: navigate → analyze_dom
181
- - Pattern for interaction: navigate → take_screenshot → actions
182
-
183
- Session state (cookies, localStorage) preserved. Timeout 30 seconds. Handles redirects automatically.`,
184
- schema: {
185
- url: z.string().url().describe("Target URL with protocol (http:// or https://). Redirects handled automatically.")
186
- },
187
- handler: navigateHandler
188
- },
189
-
190
- // ============================================================================
191
- // Browser Actions
192
- // ============================================================================
193
- "actions": {
194
- description: `Execute browser interactions in sequence. Each action completes before next begins.
195
-
196
- MANDATORY PLANNING PROTOCOL:
197
-
198
- BEFORE executing ANY action sequence, you MUST follow this protocol:
199
-
200
- 1. STATE THE GOAL
201
- Write down the exact outcome you want to achieve.
202
- Example: "Enter formula (8 × 5) + (12 ÷ 3) - 7 into calculator"
203
- Example: "Fill login form with username and password, then submit"
204
- Example: "Navigate through multi-step checkout process"
205
-
206
- 2. PLAN THE SEQUENCE
207
- List each action and its expected result/state after execution.
208
- IMPORTANT: Write this planning EVEN for TYPE actions - always document your plan.
209
-
210
- Example (Calculator):
211
- Action 1: Click "(" → Display shows "("
212
- Action 2: Click "8" → Display shows "(8"
213
- Action 3: Click "×" → Display shows "(8×"
214
- Action 4: Click "5" → Display shows "(8×5"
215
- Action 5: Click ")" → Display shows "(8×5)"
216
- Action 6: Click "+" → Display shows "(8×5)+"
217
- ... continue for each step
218
-
219
- Example (Form):
220
- Action 1: Click username field → Field focused
221
- Action 2: Type "user@example.com" → Username entered
222
- Action 3: Click password field → Password field focused
223
- Action 4: Type "password123" → Password entered (hidden)
224
- Action 5: Click submit button → Form submitted
225
-
226
- Example (Calculator with TYPE):
227
- Action 1: Click input field → Field focused
228
- Action 2: Type "(15*8)/(4+2)" → Formula entered
229
- Action 3: Click "=" → Result calculated
230
-
231
- 3. VERIFY THE LOGIC
232
- Trace through the sequence mentally:
233
- - Does each action move toward the goal?
234
- - What is the state after each action?
235
- - Are there dependencies between actions?
236
- - Could any action produce unintended results?
237
- - Is this the most efficient approach?
238
-
239
- 4. EFFICIENT ALTERNATIVES:
240
- Before using click sequences, verify:
241
- - Can I TYPE instead of clicking individual buttons?
242
- - Can I NAVIGATE instead of clicking coordinates?
243
- - Is there a simpler single-action approach?
244
-
245
- PREFER:
246
- - TYPE over click sequence (for formulas, text input)
247
- - NAVIGATE over click coordinates (for links)
248
- - Single action over multiple actions (when possible)
249
-
250
- Example - Calculator:
251
- INCORRECT: Click "(" → "1" → "5" → "×" → "8" → ")" → "/" → "(" → "4" → "+" → "2" → ")" → "=" (13 actions)
252
- CORRECT: If input field exists, type "(15*8)/(4+2)" then click "=" (2 actions)
253
-
254
- 5. EXECUTE ONLY AFTER VERIFICATION
255
- Build the action array only after steps 1-4 are complete and verified.
256
-
257
- 6. VERIFY RESULT AFTER EXECUTION
258
- After complex sequences complete:
259
- - Check if result matches your planned input/goal
260
- - If mismatch detected (e.g., planned sin(45) but got sin(42)):
261
- * Acknowledge: "Planned [X] but result shows [Y]"
262
- * Explain likely cause (wrong coordinate, button misidentification, etc.)
263
- * Decide: Continue with corrected understanding OR re-execute if critical
264
- - For calculator/form tasks, verify the final input/output matches expectation
265
-
266
- This protocol applies to ALL sequential actions: calculators, forms, navigation, games, automation workflows, etc.
267
-
268
- COMMON MISTAKES TO AVOID:
269
-
270
- Mistake: Not closing parentheses/brackets before next operation
271
- Example: Click "√" → "64" → "+" results in √(64+...) instead of √64 +
272
- Fix: Plan to close grouping before continuing: "√" → "64" → ")" → "+"
273
-
274
- Mistake: Wrong order in multi-step forms
275
- Example: Type in all fields → click submits → data goes to wrong inputs
276
- Fix: Click field → type → click next field → type (interleave click and type)
277
-
278
- Mistake: Not verifying state between dependent actions
279
- Example: Click modal button → immediately click behind it → clicks wrong element
280
- Fix: Add wait after modal opens, verify state, then continue
281
-
282
- PURPOSE: Perform user-like actions (clicking, typing, scrolling) for INTERACTION TASKS.
283
-
284
- TASK CLASSIFICATION:
285
-
286
- INTERACTION TASKS (use actions):
287
- - UI testing, automation testing
288
- - Form filling that requires interaction
289
- - Button clicking, element interaction
290
- → Workflow: take_screenshot → PLAN → VERIFY → actions
291
-
292
- INFORMATION TASKS (DON'T use actions for navigation):
293
- - Reading content, extracting information
294
- - Following links to other pages
295
- → Workflow: analyze_dom → navigate (use href, NOT click)
296
-
297
- WHEN TO USE:
298
- - After take_screenshot when you have coordinates
299
- - Clicking buttons, typing in forms, scrolling
300
- - Testing interactions, automating workflows
301
- - Solving obstacles (captcha, modals)
302
-
303
- WHEN TO AVOID:
304
- - Information extraction (use analyze_dom instead)
305
- - Following links to navigate (use navigate with href from analyze_dom, not click coordinates)
306
- - Reading text content (analyze_dom.content is faster)
307
-
308
- COORDINATE SOURCE:
309
-
310
- Coordinates come from AI vision analysis of screenshots, NOT from analyze_dom.
311
-
312
- Process for getting coordinates:
313
- 1. Call take_screenshot to capture current page
314
- 2. Use AI vision to identify element positions in screenshot
315
- 3. Determine pixel coordinates (x, y) from visual analysis
316
- 4. Apply PLANNING PROTOCOL before building actions
317
- 5. Use verified coordinates in actions
318
-
319
- Example - Click "Login" button:
320
- Task: Click button labeled "Login"
321
- → take_screenshot (capture page)
322
- → AI vision: "Login button at x=1200, y=45"
323
- → PLAN: Click button → form appears
324
- → VERIFY: Single click achieves goal
325
- → Execute: {type: "click", x: 1200, y: 45}
326
-
327
- Example - Sequential actions with planning:
328
- Task: Calculate (15 × 8) / (4 + 2) - 3²
329
-
330
- APPROACH A - If input field exists (PREFERRED):
331
- → take_screenshot
332
- → PLAN: Type formula directly, then calculate
333
- → VERIFY: Input field accepts text → yes
334
- → Execute: [
335
- {click input field x:640, y:311},
336
- {type "(15*8)/(4+2)-3^2"},
337
- {click equals x:913, y:479}
338
- ]
339
-
340
- APPROACH B - If must use buttons:
341
- → take_screenshot
342
- → PLAN: Each button click in exact order
343
- "(": x:672,y:320 → display "("
344
- "1": x:672,y:449 → display "(1"
345
- "5": x:730,y:405 → display "(15"
346
- ... (continue for all 15+ buttons)
347
- → VERIFY: Sequence produces correct formula
348
- → Execute: [full button sequence with waits]
349
-
350
- Example - Form filling with planning:
351
- Task: Fill and submit login form
352
- → take_screenshot
353
- → PLAN:
354
- 1. Click username field → focused
355
- 2. Type email → entered
356
- 3. Click password field → focused
357
- 4. Type password → entered (hidden)
358
- 5. Click submit → form submitted
359
- → VERIFY: Each field gets correct data in correct order
360
- → Execute: [
361
- {click x:640, y:300},
362
- {type "user@example.com"},
363
- {click x:640, y:380},
364
- {type "password123"},
365
- {click x:640, y:450}
366
- ]
367
-
368
- ACTION TYPES:
369
-
370
- click {x, y, click?: "left"/"right"/"middle"}
371
- Click at coordinates from screenshot AI vision analysis
372
-
373
- type {text?: "string", key?: "Enter", clearFirst?: true}
374
- text: String to type (click input field first)
375
- key: Special key (Enter, Tab, Escape, Backspace, ArrowUp, ArrowDown, etc)
376
- clearFirst: true (default) removes existing text
377
-
378
- move {x, y, steps?: 1}
379
- Move mouse to coordinates from screenshot (hover effects, tooltips)
380
-
381
- scroll {deltaX?: 0, deltaY: 500}
382
- deltaY positive=down, negative=up
383
-
384
- wait {delay: milliseconds}
385
- Pause between actions (use between dependent actions)
386
-
387
- extract_data {selector: "CSS selector"}
388
- Extract text/value from DOM (use after actions complete)
389
-
390
- OUTPUT:
391
- - Success: "Successfully performed N action(s) in sequence"
392
- - With extraction: Returns extracted text/value
393
- - Error: Descriptive message if action fails
394
-
395
- For complex sequences (>10 actions):
396
- - Consider breaking into smaller batches (5-8 actions per batch)
397
- - Take screenshot between batches to verify partial results
398
- - Adjust if needed before continuing
399
-
400
- Obstacle handling (captcha, modal, popup):
401
- - NEVER stop or ask user - solve automatically
402
- - Workflow: take_screenshot → identify → solve/dismiss → verify → continue
403
- - If fails after attempts: report failure, proceed if possible`,
404
- schema: {
405
- actions: z.array(
406
- z.discriminatedUnion("type", [
407
- z.object({
408
- type: z.literal("click"),
409
- x: z.number().describe("X coordinate from screenshot AI vision"),
410
- y: z.number().describe("Y coordinate from screenshot AI vision"),
411
- click: z.enum(["left", "right", "middle"]).optional().describe("Mouse button (default: left)")
412
- }),
413
- z.object({
414
- type: z.literal("type"),
415
- text: z.string().optional().describe("Text to type into focused element"),
416
- key: z.string().optional().describe("Special key (Enter, Tab, Escape, ArrowDown, Backspace, etc)"),
417
- clearFirst: z.boolean().optional().describe("Clear field before typing (default: true)")
418
- }),
419
- z.object({
420
- type: z.literal("move"),
421
- x: z.number().describe("X coordinate from screenshot AI vision"),
422
- y: z.number().describe("Y coordinate from screenshot AI vision"),
423
- steps: z.number().optional().describe("Steps for smooth movement (default: 1)")
424
- }),
425
- z.object({
426
- type: z.literal("scroll"),
427
- deltaX: z.number().optional().describe("Horizontal pixels (positive=right, negative=left)"),
428
- deltaY: z.number().optional().describe("Vertical pixels (positive=down, negative=up)"),
429
- smooth: z.boolean().optional().describe("Smooth animation (default: false)")
430
- }),
431
- z.object({
432
- type: z.literal("wait"),
433
- delay: z.number().describe("Wait duration in milliseconds before next action")
434
- }),
435
- z.object({
436
- type: z.literal("extract_data"),
437
- selector: z.string().describe("CSS selector (e.g., '#username') or element ID to extract data from")
438
- })
439
- ])
440
- ).min(1).describe("Actions to execute in sequence. Apply PLANNING PROTOCOL before building this array.")
441
- },
442
- handler: actionsHandler
443
- },
444
-
445
- // ============================================================================
446
- // Page Inspection
447
- // ============================================================================
448
- "analyze_dom": {
449
- description: `Extract page information - text content, links, structure, and page metadata.
450
-
451
- PRIMARY USE CASES:
452
-
453
- 1. WEBSITE EXPLORATION/CLONING
454
- Task keywords: "explore website", "clone website", "analyze site", "jelajahi website", "get all pages"
455
- → Use ONLY analyze_dom + navigate (NO screenshot needed)
456
- → Workflow: analyze_dom → navigate(href) → analyze_dom → repeat
457
-
458
- Example: "Clone example-company.com"
459
- CORRECT: analyze_dom → get links → navigate to each page → analyze_dom each page
460
- INCORRECT: analyze_dom → take_screenshot (expensive, unnecessary)
461
-
462
- 2. INFORMATION EXTRACTION
463
- Task keywords: "find", "read", "get", "extract", "search content"
464
- → Use analyze_dom for content, navigate for links
465
- → NO screenshot needed unless visual-only content
466
-
467
- 3. CONTENT RESEARCH
468
- Task keywords: "what does page say", "get article", "find documentation"
469
- → analyze_dom is PRIMARY method
470
- → Much faster and cheaper than screenshot
471
-
472
- TASK CLASSIFICATION:
473
-
474
- INFORMATION EXTRACTION TASKS (reading, finding, extracting, exploring):
475
- → PRIMARY: analyze_dom (fast, efficient, no screenshot needed)
476
- → SECONDARY: navigate (if need to follow links - use href, not click)
477
- → AVOID: take_screenshot (expensive, unnecessary for text content)
478
-
479
- Examples:
480
- - "Explore website and all pages" → analyze_dom + navigate only
481
- - "Clone website structure" → analyze_dom + navigate only
482
- - "Find pricing information" → analyze_dom.content
483
- - "Get all navigation links" → analyze_dom.navigation.links
484
- - "Read article summary" → analyze_dom.structure + content
485
- - "Find documentation URL" → analyze_dom.navigation.links → navigate(href)
486
-
487
- INTERACTION TASKS (clicking, typing, testing):
488
- → PRIMARY: take_screenshot (for coordinates)
489
- → SECONDARY: actions (execute interactions)
490
- → SUPPORT: analyze_dom (context only)
491
-
492
- Examples:
493
- - "Click login button" → take_screenshot → actions
494
- - "Fill form" → analyze_dom (optional context) → take_screenshot → actions
495
-
496
- WHEN TO USE:
497
- - Read page content (text, headings, paragraphs)
498
- - Get all links with URLs (for navigation)
499
- - Understand page structure and hierarchy
500
- - Extract form metadata
501
- - Check page properties (iframes, captcha)
502
-
503
- WHEN TO SKIP:
504
- - Need pixel coordinates for clicking (use take_screenshot instead)
505
- - Already have page information and unchanged
506
-
507
- EFFICIENT INFORMATION EXTRACTION WORKFLOW:
508
-
509
- Step 1: navigate to target page
510
- Step 2: analyze_dom (get content, links, structure)
511
- Step 3: If need more pages:
512
- - Find link in navigation.links (has href)
513
- - Use navigate(href) - NOT click coordinates
514
- - Repeat analyze_dom
515
- Step 4: Extract and return information - DONE
516
-
517
- NO screenshot needed for information tasks!
518
-
519
- Example - "Explore/clone website example-company.com":
520
- → open_new_tab("https://example-company.com")
521
- → analyze_dom(['navigation', 'structure', 'content']) - get ALL page info
522
- → Find links: About, Products, Blog, Contact (in navigation.links)
523
- → navigate("https://example-company.com/about")
524
- → analyze_dom(['structure', 'content']) - get About page info
525
- → navigate("https://example-company.com/products")
526
- → analyze_dom(['structure', 'content']) - get Products page info
527
- → Continue for all pages
528
- → DONE (NO screenshot needed at all!)
529
-
530
- Example - "Find pricing information at loom.com":
531
- → open_new_tab("https://www.loom.com")
532
- → analyze_dom(['navigation', 'content'])
533
- → Find pricing link: {text: "Pricing", href: "https://www.loom.com/pricing"}
534
- → navigate("https://www.loom.com/pricing")
535
- → analyze_dom(['content', 'structure'])
536
- → Extract pricing from content.paragraphs
537
- → DONE (no screenshot used!)
538
-
539
- OUTPUT STRUCTURE:
540
-
541
- navigation: {links: [{text, href}]}
542
- - ALL links on page (not just nav menu) - comprehensive extraction
543
- - Deduplicated by href
544
- - Use href with navigate tool (efficient)
545
- - Avoid clicking coordinates for navigation
546
-
547
- structure: {headings: [{level, text, id}], sections: [{heading, summary}]}
548
- - Page organization and hierarchy
549
-
550
- content: {paragraphs: [strings]}
551
- - Text content from various elements (p, div, li, td, span)
552
- - Deduplicated, up to 100 text items
553
- - Primary source for information extraction
554
-
555
- forms: [{formId, action, fields: [{label, type, name, placeholder, required, currentValue}]}]
556
- - Form structure metadata
557
-
558
- summary: {url, title, hasIframes, hasCaptcha, scrollableHeight, viewportHeight}
559
- - Page metadata
560
- - hasIframes/hasCaptcha: may need take_screenshot
561
-
562
- IMPORTANT:
563
-
564
- analyze_dom is PRIMARY method for information extraction tasks.
565
- Use navigate + analyze_dom workflow - more efficient than screenshot + click.
566
- Result becomes stale after navigation - re-run on new pages.`,
567
- schema: {
568
- include: z.array(z.enum(['navigation', 'structure', 'content', 'forms', 'summary'])).optional().describe("Sections to return. Omit for all. Use ['navigation', 'content'] for most reading tasks.")
569
- },
570
- handler: analyzeDomHandler
571
- },
572
-
573
- "take_screenshot": {
574
- description: `Capture viewport screenshot for visual analysis and coordinate determination.
575
-
576
- TASK CLASSIFICATION:
577
-
578
- Ask yourself: Is this an INTERACTION task or INFORMATION task?
579
-
580
- INTERACTION TASKS (clicking, typing, testing):
581
- → USE: take_screenshot (get coordinates via AI vision)
582
- → Keywords: "click", "fill form", "type", "test UI", "automate interaction", "submit"
583
- → THEN: Apply PLANNING PROTOCOL before actions (see actions tool)
584
-
585
- INFORMATION TASKS (reading, finding, exploring):
586
- → DO NOT USE: take_screenshot (expensive, unnecessary)
587
- → Keywords: "explore", "clone", "jelajahi", "find", "read", "get", "extract", "analyze"
588
- → USE INSTEAD: analyze_dom (10x faster, much cheaper)
589
-
590
- WHEN TO USE:
591
- - Need pixel coordinates for clicking/typing (analyze_dom has no coordinates)
592
- - Visual-only content (calculator displays, canvas, images, charts)
593
- - Iframe content (hasIframes=true - analyze_dom can't see inside)
594
- - Obstacles (captcha, popups, modals)
595
- - UI/visual verification (styling, layout, colors)
596
- - Before/after visual comparison
597
-
598
- WHEN TO SKIP:
599
- - Website exploration/cloning (use analyze_dom + navigate only)
600
- - Reading page content (analyze_dom.content is faster)
601
- - Getting links for navigation (analyze_dom.navigation.links)
602
- - Scrolling to "see more content" (analyze_dom already gets all DOM content)
603
- - Information extraction tasks (use analyze_dom instead)
604
- - Navigation (use navigate with href, not click coordinates)
605
- - Already have recent screenshot and page unchanged
606
-
607
- Cost consideration:
608
- - Screenshot is EXPENSIVE (image encoding, AI vision processing)
609
- - analyze_dom is CHEAP (text extraction, no image processing)
610
- - Rule: If task is reading/extracting information → use analyze_dom, NOT screenshot
611
-
612
- COMMON MISTAKES:
613
-
614
- Mistake: Using screenshot for website exploration
615
- Example: "Explore website" → take_screenshot → scroll → screenshot → scroll → screenshot
616
- Fix: Use analyze_dom + navigate only (no screenshot needed)
617
-
618
- Mistake: Screenshot to "see" text content
619
- Example: Want to read page → take_screenshot
620
- Fix: analyze_dom.content gives you all text directly
621
-
622
- Mistake: Screenshot + scroll to see full page
623
- Example: take_screenshot → scroll → screenshot → scroll
624
- Fix: analyze_dom gets entire DOM at once, no scrolling needed
625
-
626
- OUTPUT:
627
- PNG image (base64 encoded) showing visible viewport.
628
-
629
- WORKFLOW AFTER SCREENSHOT:
630
-
631
- After taking screenshot for interaction tasks:
632
- 1. AI vision: Analyze screenshot to identify element positions
633
- 2. Determine pixel coordinates (x, y) for target elements
634
- 3. Apply PLANNING PROTOCOL (see actions tool) before executing:
635
- - STATE THE GOAL
636
- - PLAN THE SEQUENCE (what happens after each action)
637
- - VERIFY THE LOGIC
638
- - EFFICIENT ALTERNATIVES
639
- - VERIFY RESULT after execution
640
- 4. Execute actions only after verification
641
-
642
- Example (INCORRECT) - Website exploration:
643
-
644
- Task: "Explore example-company.com and all pages"
645
- INCORRECT approach (expensive, slow, unnecessary):
646
- → open_new_tab
647
- → take_screenshot (not needed)
648
- → scroll → take_screenshot (repeat 5 times)
649
- → navigate to /about
650
- → take_screenshot (not needed)
651
- → scroll → take_screenshot
652
- Total: 10+ screenshots, very expensive!
653
-
654
- CORRECT approach (fast, efficient):
655
- → open_new_tab
656
- → analyze_dom (get all content + links)
657
- → navigate to /about
658
- → analyze_dom (get all content)
659
- → Continue for other pages
660
- Total: 0 screenshots, 10x faster!
661
-
662
- Example (CORRECT) - Button interaction:
663
-
664
- Task: "Click the login button"
665
- → take_screenshot (need coordinates for interaction)
666
- → AI vision: "Login button at x=1200, y=45"
667
- → PLAN: Click button → login form appears
668
- → VERIFY: Single action achieves goal
669
- → actions([{click, x:1200, y:45}])
670
-
671
- Example (CORRECT) - Sequential calculator actions:
672
-
673
- Task: Enter calculator formula (15 × 8) / (4 + 2)
674
- → take_screenshot (need coordinates for interaction)
675
- → AI vision: Identify all needed buttons and their positions
676
- → CHECK: Is there an input field? If yes, TYPE is more efficient!
677
- → If TYPE available:
678
- PLAN: Click input → type formula → calculate
679
- actions([{click x:640,y:311}, {type "(15*8)/(4+2)"}, {click x:913,y:479}])
680
- → If must use buttons:
681
- PLAN each click: "(" at x:672,y:320 → "1" at x:672,y:449 → ...
682
- VERIFY: Sequence produces correct formula in correct order
683
- actions([full sequence with verification])
684
-
685
- Example (CORRECT) - Form filling:
686
-
687
- Task: "Fill login form"
688
- → take_screenshot (need coordinates for interaction)
689
- → AI vision: username x:640,y:300; password x:640,y:380; submit x:640,y:450
690
- → PLAN:
691
- 1. Click username → focus
692
- 2. Type email → entered
693
- 3. Click password → focus
694
- 4. Type password → entered
695
- 5. Click submit → submitted
696
- → VERIFY: Correct order, correct data, correct targets
697
- → VERIFY RESULT: After execution, confirm form was filled correctly
698
- → actions([sequence])
699
-
700
- OBSTACLE HANDLING:
701
-
702
- When encountering obstacles (captcha, modals, popups, challenges):
703
- - DO NOT stop or ask user
704
- - Attempt to solve/dismiss automatically using AI vision
705
- - Continue original task after resolving
706
-
707
- Captcha workflow:
708
- 1. take_screenshot → identify captcha type
709
- 2. PLAN solution based on type:
710
- - Checkbox: click checkbox, wait, verify
711
- - Image selection: identify matching images, click sequence, verify
712
- - Text/math: solve, input, submit
713
- 3. Execute solution
714
- 4. take_screenshot (verify passed)
715
- 5. Continue task
716
-
717
- Modal/popup workflow:
718
- 1. take_screenshot → identify modal/popup
719
- 2. Find close button or dismiss action
720
- 3. actions([click dismiss])
721
- 4. Continue task
722
-
723
- IMPORTANT:
724
- - Coordinates from AI vision are pixel positions in screenshot
725
- - Can see inside iframes (unlike analyze_dom)
726
- - Always apply PLANNING PROTOCOL before executing sequential actions
727
- - When obstacles appear: solve automatically, never stop
728
- - After actions: take new screenshot to verify or find new elements`,
729
- handler: takeScreenshotHandler
730
- },
731
-
732
- "get_console_logs": {
733
- description: `Retrieve browser console logs for debugging.
734
-
735
- WHEN TO USE:
736
- - Debug JavaScript errors after actions fail
737
- - Monitor API calls or script execution
738
- - Detect client-side errors not visible in UI
739
- - Investigate unexpected behavior
740
-
741
- OUTPUT: Formatted text, most recent first:
742
- [2024-01-27 10:30:45] ERROR: Uncaught TypeError: Cannot read property 'value' of null
743
- [2024-01-27 10:30:44] WARN: Deprecated API usage
744
-
745
- Logs captured from browser console only - not backend logs. Default 20 entries, max 100. Logs persist until cleared or tab closed.`,
746
- schema: {
747
- limit: z.number().min(1).max(100).optional().default(20).describe("Maximum number of log entries to return (min: 1, max: 100, default: 20)")
748
- },
749
- handler: getConsoleLogsHandler
750
- },
751
-
752
- "clear_console_logs": {
753
- description: `Clear console logs from backend storage.
754
-
755
- WHEN TO USE:
756
- - Before important actions to isolate new logs
757
- - Clean slate for debugging specific operations
758
-
759
- OUTPUT: Success confirmation.
760
-
761
- Only clears backend storage, not browser console. Use before actions for clean debugging context.`,
762
- handler: clearConsoleLogsHandler
763
- },
764
-
765
- "execute_console": {
766
- description: `Execute JavaScript in browser console.
767
-
768
- WHEN TO USE:
769
- - Extract dynamic data not in DOM (computed styles, variables, runtime state)
770
- - Access browser APIs (localStorage, sessionStorage)
771
- - Debug page behavior
772
- - Verify JavaScript context
773
-
774
- COMMON COMMANDS:
775
- - document.title
776
- - window.location.href
777
- - localStorage.getItem("key")
778
- - document.querySelector("#id").value
779
- - Array.from(document.querySelectorAll(".item")).map(el => el.textContent)
780
-
781
- OUTPUT: Execution result as JSON string, or error message.
782
-
783
- Has full access to page context. Return value must be JSON-serializable. Use for extraction/debugging, not complex automation.`,
784
- schema: {
785
- command: z.string().min(1).describe("JavaScript expression or statement to execute (e.g., 'document.title', 'window.location.href', 'localStorage.getItem(\"key\")')")
786
- },
787
- handler: executeConsoleHandler
788
- },
789
-
790
- }
791
- });
1
+ /**
2
+ * Browser Automation - Custom MCP Server
3
+ *
4
+ * Provides comprehensive browser automation tools for AI-driven testing and exploration.
5
+ */
6
+
7
+ import { z } from "zod";
8
+ import { defineServer } from "../helper";
9
+
10
+ // Import handlers
11
+ import { listTabsHandler, switchTabHandler, openNewTabHandler, closeTabHandler, navigateHandler, setViewportHandler } from "./browser";
12
+ import { actionsHandler } from "./actions";
13
+ import { getConsoleLogsHandler, clearConsoleLogsHandler, executeConsoleHandler, takeScreenshotHandler, analyzeDomHandler } from "./inspection";
14
+
15
+ export default defineServer({
16
+ name: "browser-automation",
17
+ version: "1.0.0",
18
+ tools: {
19
+ // ============================================================================
20
+ // Browser Tab Management
21
+ // ============================================================================
22
+ "list_tabs": {
23
+ description: `List all open browser tabs with metadata.
24
+
25
+ WHEN TO USE:
26
+ - Start of session to see available tabs
27
+ - Before switching tabs
28
+ - Verify tab state after open/close operations
29
+
30
+ OUTPUT: Formatted text with tab index, ID, title, URL, and active indicator (*). Example:
31
+ [1] tab-abc123 * Google Search - https://google.com
32
+ [2] tab-def456 Calculator - https://calculator.net
33
+
34
+ Use exact tab ID from output with switch_tab or close_tab - not the index number.`,
35
+ handler: listTabsHandler
36
+ },
37
+
38
+ "switch_tab": {
39
+ description: `Switch browser focus to different tab.
40
+
41
+ WHEN TO USE:
42
+ - Multi-tab workflows
43
+ - Comparing content across tabs
44
+ - Managing multiple sessions
45
+
46
+ OUTPUT: Confirmation with switched tab's ID, title, and URL.
47
+
48
+ Tab ID must be obtained from list_tabs. After switching, previous tab's context is lost - take new screenshot or analyze_dom if needed.`,
49
+ schema: {
50
+ tabId: z.string().min(1).describe("Tab ID obtained from list_tabs output (e.g., 'tab-abc123')")
51
+ },
52
+ handler: switchTabHandler
53
+ },
54
+
55
+ "open_new_tab": {
56
+ description: `Create new browser tab with optional URL and viewport configuration.
57
+
58
+ WHEN TO USE:
59
+ - Starting fresh session
60
+ - Opening multiple pages for comparison
61
+ - Separating workflows
62
+ - Testing risky actions in isolation
63
+ - Testing responsive designs across devices
64
+
65
+ OUTPUT: Tab ID, title, and URL of newly created tab. New tab automatically becomes active.
66
+
67
+ Automatically creates session. New tab is immediately active - no need to switch_tab. If URL provided, waits for page load.
68
+
69
+ VIEWPORT MODES:
70
+ - desktop: 1920x1080 (default rotation: landscape) - For testing full desktop layouts
71
+ - laptop: 1280x800 (default, default rotation: landscape) - Most common desktop viewport
72
+ - tablet: 820x1050 (default rotation: portrait) - iPad-like tablet viewport
73
+ - mobile: 393x740 (default rotation: portrait) - Modern smartphone viewport
74
+
75
+ ROTATION:
76
+ - portrait: Standard vertical orientation (default for tablet/mobile)
77
+ - landscape: Horizontal orientation (default for desktop/laptop)
78
+ - Omit rotation to use device-appropriate default`,
79
+ schema: {
80
+ url: z.string().url().optional().describe("Initial URL with protocol (http:// or https://). Omit for blank tab."),
81
+ deviceSize: z.enum(['desktop', 'laptop', 'tablet', 'mobile']).optional().default('laptop').describe("Viewport device size (default: laptop). Choose based on testing needs."),
82
+ rotation: z.enum(['portrait', 'landscape']).optional().describe("Screen orientation. If omitted, uses device-appropriate default: landscape for desktop/laptop, portrait for tablet/mobile.")
83
+ },
84
+ handler: openNewTabHandler
85
+ },
86
+
87
+ "close_tab": {
88
+ description: `Close tab by ID and cleanup session.
89
+
90
+ WHEN TO USE:
91
+ - Cleanup after completing task
92
+ - Managing memory/resources
93
+ - Closing tabs no longer needed
94
+
95
+ OUTPUT: Confirmation message. If closed tab was active, automatically switches to another tab and returns new active tab info.
96
+
97
+ Cannot close last remaining tab (will error). If tab was active, focus moves to another tab - check list_tabs after.`,
98
+ schema: {
99
+ tabId: z.string().min(1).describe("Tab ID obtained from list_tabs output (e.g., 'tab-abc123')")
100
+ },
101
+ handler: closeTabHandler
102
+ },
103
+
104
+ "set_viewport": {
105
+ description: `Change viewport settings (device size and rotation) for active tab.
106
+
107
+ WHEN TO USE:
108
+ - Testing responsive designs across devices
109
+ - Switching between mobile and desktop views
110
+ - Testing different screen orientations
111
+ - Simulating different device types
112
+
113
+ OUTPUT: Confirmation with tab ID and new viewport settings.
114
+
115
+ VIEWPORT MODES:
116
+ - desktop: 1920x1080 - For testing full desktop layouts
117
+ - laptop: 1280x800 - Most common desktop viewport
118
+ - tablet: 820x1050 - iPad-like tablet viewport
119
+ - mobile: 393x740 - Modern smartphone viewport
120
+
121
+ ROTATION:
122
+ - portrait: Standard vertical orientation
123
+ - landscape: Horizontal orientation (width and height swapped)
124
+
125
+ The viewport change is applied immediately to the active tab. If testing multiple viewports, consider taking screenshots before and after to compare layouts.`,
126
+ schema: {
127
+ deviceSize: z.enum(['desktop', 'laptop', 'tablet', 'mobile']).optional().describe("Viewport device size. Omit to keep current device size."),
128
+ rotation: z.enum(['portrait', 'landscape']).optional().describe("Screen orientation. Omit to keep current rotation.")
129
+ },
130
+ handler: setViewportHandler
131
+ },
132
+
133
+ // ============================================================================
134
+ // Navigation
135
+ // ============================================================================
136
+ "navigate": {
137
+ description: `Navigate to URL and wait for page load.
138
+
139
+ PRIMARY USE: Following links when you have href URL (more efficient than clicking coordinates).
140
+
141
+ WHEN TO USE:
142
+ - Following links from analyze_dom.navigation.links (use href directly)
143
+ - Moving to known URLs
144
+ - Multi-page information extraction workflow
145
+ - Refreshing with new parameters
146
+
147
+ WHEN TO AVOID:
148
+ - Don't have URL (must use take_screenshot + click instead)
149
+ - Testing click interactions (use actions for actual click testing)
150
+
151
+ EFFICIENT WORKFLOW:
152
+
153
+ Information extraction across pages:
154
+ → navigate(page1)
155
+ → analyze_dom(['navigation', 'content'])
156
+ → Find link: {text: "Pricing", href: "https://example.com/pricing"}
157
+ → navigate(href) ← USE THIS, not click coordinates!
158
+ → analyze_dom(['content'])
159
+ → Extract information
160
+
161
+ Why navigate > click for links:
162
+ - Faster: direct URL navigation
163
+ - More reliable: no coordinate dependency
164
+ - Cleaner: no need for screenshot
165
+ - Better for information tasks
166
+
167
+ Example - Multi-page extraction:
168
+ Task: "Find pricing at loom.com"
169
+ → navigate("https://www.loom.com")
170
+ → analyze_dom(['navigation'])
171
+ → Found: {text: "Pricing", href: "https://www.loom.com/pricing"}
172
+ → navigate("https://www.loom.com/pricing") ← Efficient!
173
+ → analyze_dom(['content'])
174
+ → Extract pricing info
175
+
176
+ OUTPUT: Final URL after all redirects complete.
177
+
178
+ After navigate, page is completely new:
179
+ - Previous analyze_dom data is invalid - re-run on new page
180
+ - Pattern for information: navigate → analyze_dom
181
+ - Pattern for interaction: navigate → take_screenshot → actions
182
+
183
+ Session state (cookies, localStorage) preserved. Timeout 30 seconds. Handles redirects automatically.`,
184
+ schema: {
185
+ url: z.string().url().describe("Target URL with protocol (http:// or https://). Redirects handled automatically.")
186
+ },
187
+ handler: navigateHandler
188
+ },
189
+
190
+ // ============================================================================
191
+ // Browser Actions
192
+ // ============================================================================
193
+ "actions": {
194
+ description: `Execute browser interactions in sequence. Each action completes before next begins.
195
+
196
+ MANDATORY PLANNING PROTOCOL:
197
+
198
+ BEFORE executing ANY action sequence, you MUST follow this protocol:
199
+
200
+ 1. STATE THE GOAL
201
+ Write down the exact outcome you want to achieve.
202
+ Example: "Enter formula (8 × 5) + (12 ÷ 3) - 7 into calculator"
203
+ Example: "Fill login form with username and password, then submit"
204
+ Example: "Navigate through multi-step checkout process"
205
+
206
+ 2. PLAN THE SEQUENCE
207
+ List each action and its expected result/state after execution.
208
+ IMPORTANT: Write this planning EVEN for TYPE actions - always document your plan.
209
+
210
+ Example (Calculator):
211
+ Action 1: Click "(" → Display shows "("
212
+ Action 2: Click "8" → Display shows "(8"
213
+ Action 3: Click "×" → Display shows "(8×"
214
+ Action 4: Click "5" → Display shows "(8×5"
215
+ Action 5: Click ")" → Display shows "(8×5)"
216
+ Action 6: Click "+" → Display shows "(8×5)+"
217
+ ... continue for each step
218
+
219
+ Example (Form):
220
+ Action 1: Click username field → Field focused
221
+ Action 2: Type "user@example.com" → Username entered
222
+ Action 3: Click password field → Password field focused
223
+ Action 4: Type "password123" → Password entered (hidden)
224
+ Action 5: Click submit button → Form submitted
225
+
226
+ Example (Calculator with TYPE):
227
+ Action 1: Click input field → Field focused
228
+ Action 2: Type "(15*8)/(4+2)" → Formula entered
229
+ Action 3: Click "=" → Result calculated
230
+
231
+ 3. VERIFY THE LOGIC
232
+ Trace through the sequence mentally:
233
+ - Does each action move toward the goal?
234
+ - What is the state after each action?
235
+ - Are there dependencies between actions?
236
+ - Could any action produce unintended results?
237
+ - Is this the most efficient approach?
238
+
239
+ 4. EFFICIENT ALTERNATIVES:
240
+ Before using click sequences, verify:
241
+ - Can I TYPE instead of clicking individual buttons?
242
+ - Can I NAVIGATE instead of clicking coordinates?
243
+ - Is there a simpler single-action approach?
244
+
245
+ PREFER:
246
+ - TYPE over click sequence (for formulas, text input)
247
+ - NAVIGATE over click coordinates (for links)
248
+ - Single action over multiple actions (when possible)
249
+
250
+ Example - Calculator:
251
+ INCORRECT: Click "(" → "1" → "5" → "×" → "8" → ")" → "/" → "(" → "4" → "+" → "2" → ")" → "=" (13 actions)
252
+ CORRECT: If input field exists, type "(15*8)/(4+2)" then click "=" (2 actions)
253
+
254
+ 5. EXECUTE ONLY AFTER VERIFICATION
255
+ Build the action array only after steps 1-4 are complete and verified.
256
+
257
+ 6. VERIFY RESULT AFTER EXECUTION
258
+ After complex sequences complete:
259
+ - Check if result matches your planned input/goal
260
+ - If mismatch detected (e.g., planned sin(45) but got sin(42)):
261
+ * Acknowledge: "Planned [X] but result shows [Y]"
262
+ * Explain likely cause (wrong coordinate, button misidentification, etc.)
263
+ * Decide: Continue with corrected understanding OR re-execute if critical
264
+ - For calculator/form tasks, verify the final input/output matches expectation
265
+
266
+ This protocol applies to ALL sequential actions: calculators, forms, navigation, games, automation workflows, etc.
267
+
268
+ COMMON MISTAKES TO AVOID:
269
+
270
+ Mistake: Not closing parentheses/brackets before next operation
271
+ Example: Click "√" → "64" → "+" results in √(64+...) instead of √64 +
272
+ Fix: Plan to close grouping before continuing: "√" → "64" → ")" → "+"
273
+
274
+ Mistake: Wrong order in multi-step forms
275
+ Example: Type in all fields → click submits → data goes to wrong inputs
276
+ Fix: Click field → type → click next field → type (interleave click and type)
277
+
278
+ Mistake: Not verifying state between dependent actions
279
+ Example: Click modal button → immediately click behind it → clicks wrong element
280
+ Fix: Add wait after modal opens, verify state, then continue
281
+
282
+ PURPOSE: Perform user-like actions (clicking, typing, scrolling) for INTERACTION TASKS.
283
+
284
+ TASK CLASSIFICATION:
285
+
286
+ INTERACTION TASKS (use actions):
287
+ - UI testing, automation testing
288
+ - Form filling that requires interaction
289
+ - Button clicking, element interaction
290
+ → Workflow: take_screenshot → PLAN → VERIFY → actions
291
+
292
+ INFORMATION TASKS (DON'T use actions for navigation):
293
+ - Reading content, extracting information
294
+ - Following links to other pages
295
+ → Workflow: analyze_dom → navigate (use href, NOT click)
296
+
297
+ WHEN TO USE:
298
+ - After take_screenshot when you have coordinates
299
+ - Clicking buttons, typing in forms, scrolling
300
+ - Testing interactions, automating workflows
301
+ - Solving obstacles (captcha, modals)
302
+
303
+ WHEN TO AVOID:
304
+ - Information extraction (use analyze_dom instead)
305
+ - Following links to navigate (use navigate with href from analyze_dom, not click coordinates)
306
+ - Reading text content (analyze_dom.content is faster)
307
+
308
+ COORDINATE SOURCE:
309
+
310
+ Coordinates come from AI vision analysis of screenshots, NOT from analyze_dom.
311
+
312
+ Process for getting coordinates:
313
+ 1. Call take_screenshot to capture current page
314
+ 2. Use AI vision to identify element positions in screenshot
315
+ 3. Determine pixel coordinates (x, y) from visual analysis
316
+ 4. Apply PLANNING PROTOCOL before building actions
317
+ 5. Use verified coordinates in actions
318
+
319
+ Example - Click "Login" button:
320
+ Task: Click button labeled "Login"
321
+ → take_screenshot (capture page)
322
+ → AI vision: "Login button at x=1200, y=45"
323
+ → PLAN: Click button → form appears
324
+ → VERIFY: Single click achieves goal
325
+ → Execute: {type: "click", x: 1200, y: 45}
326
+
327
+ Example - Sequential actions with planning:
328
+ Task: Calculate (15 × 8) / (4 + 2) - 3²
329
+
330
+ APPROACH A - If input field exists (PREFERRED):
331
+ → take_screenshot
332
+ → PLAN: Type formula directly, then calculate
333
+ → VERIFY: Input field accepts text → yes
334
+ → Execute: [
335
+ {click input field x:640, y:311},
336
+ {type "(15*8)/(4+2)-3^2"},
337
+ {click equals x:913, y:479}
338
+ ]
339
+
340
+ APPROACH B - If must use buttons:
341
+ → take_screenshot
342
+ → PLAN: Each button click in exact order
343
+ "(": x:672,y:320 → display "("
344
+ "1": x:672,y:449 → display "(1"
345
+ "5": x:730,y:405 → display "(15"
346
+ ... (continue for all 15+ buttons)
347
+ → VERIFY: Sequence produces correct formula
348
+ → Execute: [full button sequence with waits]
349
+
350
+ Example - Form filling with planning:
351
+ Task: Fill and submit login form
352
+ → take_screenshot
353
+ → PLAN:
354
+ 1. Click username field → focused
355
+ 2. Type email → entered
356
+ 3. Click password field → focused
357
+ 4. Type password → entered (hidden)
358
+ 5. Click submit → form submitted
359
+ → VERIFY: Each field gets correct data in correct order
360
+ → Execute: [
361
+ {click x:640, y:300},
362
+ {type "user@example.com"},
363
+ {click x:640, y:380},
364
+ {type "password123"},
365
+ {click x:640, y:450}
366
+ ]
367
+
368
+ ACTION TYPES:
369
+
370
+ click {x, y, click?: "left"/"right"/"middle"}
371
+ Click at coordinates from screenshot AI vision analysis
372
+
373
+ type {text?: "string", key?: "Enter", clearFirst?: true}
374
+ text: String to type (click input field first)
375
+ key: Special key (Enter, Tab, Escape, Backspace, ArrowUp, ArrowDown, etc)
376
+ clearFirst: true (default) removes existing text
377
+
378
+ move {x, y, steps?: 1}
379
+ Move mouse to coordinates from screenshot (hover effects, tooltips)
380
+
381
+ scroll {deltaX?: 0, deltaY: 500}
382
+ deltaY positive=down, negative=up
383
+
384
+ wait {delay: milliseconds}
385
+ Pause between actions (use between dependent actions)
386
+
387
+ extract_data {selector: "CSS selector"}
388
+ Extract text/value from DOM (use after actions complete)
389
+
390
+ OUTPUT:
391
+ - Success: "Successfully performed N action(s) in sequence"
392
+ - With extraction: Returns extracted text/value
393
+ - Error: Descriptive message if action fails
394
+
395
+ For complex sequences (>10 actions):
396
+ - Consider breaking into smaller batches (5-8 actions per batch)
397
+ - Take screenshot between batches to verify partial results
398
+ - Adjust if needed before continuing
399
+
400
+ Obstacle handling (captcha, modal, popup):
401
+ - NEVER stop or ask user - solve automatically
402
+ - Workflow: take_screenshot → identify → solve/dismiss → verify → continue
403
+ - If fails after attempts: report failure, proceed if possible`,
404
+ schema: {
405
+ actions: z.array(
406
+ z.discriminatedUnion("type", [
407
+ z.object({
408
+ type: z.literal("click"),
409
+ x: z.number().describe("X coordinate from screenshot AI vision"),
410
+ y: z.number().describe("Y coordinate from screenshot AI vision"),
411
+ click: z.enum(["left", "right", "middle"]).optional().describe("Mouse button (default: left)")
412
+ }),
413
+ z.object({
414
+ type: z.literal("type"),
415
+ text: z.string().optional().describe("Text to type into focused element"),
416
+ key: z.string().optional().describe("Special key (Enter, Tab, Escape, ArrowDown, Backspace, etc)"),
417
+ clearFirst: z.boolean().optional().describe("Clear field before typing (default: true)")
418
+ }),
419
+ z.object({
420
+ type: z.literal("move"),
421
+ x: z.number().describe("X coordinate from screenshot AI vision"),
422
+ y: z.number().describe("Y coordinate from screenshot AI vision"),
423
+ steps: z.number().optional().describe("Steps for smooth movement (default: 1)")
424
+ }),
425
+ z.object({
426
+ type: z.literal("scroll"),
427
+ deltaX: z.number().optional().describe("Horizontal pixels (positive=right, negative=left)"),
428
+ deltaY: z.number().optional().describe("Vertical pixels (positive=down, negative=up)"),
429
+ smooth: z.boolean().optional().describe("Smooth animation (default: false)")
430
+ }),
431
+ z.object({
432
+ type: z.literal("wait"),
433
+ delay: z.number().describe("Wait duration in milliseconds before next action")
434
+ }),
435
+ z.object({
436
+ type: z.literal("extract_data"),
437
+ selector: z.string().describe("CSS selector (e.g., '#username') or element ID to extract data from")
438
+ })
439
+ ])
440
+ ).min(1).describe("Actions to execute in sequence. Apply PLANNING PROTOCOL before building this array.")
441
+ },
442
+ handler: actionsHandler
443
+ },
444
+
445
+ // ============================================================================
446
+ // Page Inspection
447
+ // ============================================================================
448
+ "analyze_dom": {
449
+ description: `Extract page information - text content, links, structure, and page metadata.
450
+
451
+ PRIMARY USE CASES:
452
+
453
+ 1. WEBSITE EXPLORATION/CLONING
454
+ Task keywords: "explore website", "clone website", "analyze site", "jelajahi website", "get all pages"
455
+ → Use ONLY analyze_dom + navigate (NO screenshot needed)
456
+ → Workflow: analyze_dom → navigate(href) → analyze_dom → repeat
457
+
458
+ Example: "Clone example-company.com"
459
+ CORRECT: analyze_dom → get links → navigate to each page → analyze_dom each page
460
+ INCORRECT: analyze_dom → take_screenshot (expensive, unnecessary)
461
+
462
+ 2. INFORMATION EXTRACTION
463
+ Task keywords: "find", "read", "get", "extract", "search content"
464
+ → Use analyze_dom for content, navigate for links
465
+ → NO screenshot needed unless visual-only content
466
+
467
+ 3. CONTENT RESEARCH
468
+ Task keywords: "what does page say", "get article", "find documentation"
469
+ → analyze_dom is PRIMARY method
470
+ → Much faster and cheaper than screenshot
471
+
472
+ TASK CLASSIFICATION:
473
+
474
+ INFORMATION EXTRACTION TASKS (reading, finding, extracting, exploring):
475
+ → PRIMARY: analyze_dom (fast, efficient, no screenshot needed)
476
+ → SECONDARY: navigate (if need to follow links - use href, not click)
477
+ → AVOID: take_screenshot (expensive, unnecessary for text content)
478
+
479
+ Examples:
480
+ - "Explore website and all pages" → analyze_dom + navigate only
481
+ - "Clone website structure" → analyze_dom + navigate only
482
+ - "Find pricing information" → analyze_dom.content
483
+ - "Get all navigation links" → analyze_dom.navigation.links
484
+ - "Read article summary" → analyze_dom.structure + content
485
+ - "Find documentation URL" → analyze_dom.navigation.links → navigate(href)
486
+
487
+ INTERACTION TASKS (clicking, typing, testing):
488
+ → PRIMARY: take_screenshot (for coordinates)
489
+ → SECONDARY: actions (execute interactions)
490
+ → SUPPORT: analyze_dom (context only)
491
+
492
+ Examples:
493
+ - "Click login button" → take_screenshot → actions
494
+ - "Fill form" → analyze_dom (optional context) → take_screenshot → actions
495
+
496
+ WHEN TO USE:
497
+ - Read page content (text, headings, paragraphs)
498
+ - Get all links with URLs (for navigation)
499
+ - Understand page structure and hierarchy
500
+ - Extract form metadata
501
+ - Check page properties (iframes, captcha)
502
+
503
+ WHEN TO SKIP:
504
+ - Need pixel coordinates for clicking (use take_screenshot instead)
505
+ - Already have page information and unchanged
506
+
507
+ EFFICIENT INFORMATION EXTRACTION WORKFLOW:
508
+
509
+ Step 1: navigate to target page
510
+ Step 2: analyze_dom (get content, links, structure)
511
+ Step 3: If need more pages:
512
+ - Find link in navigation.links (has href)
513
+ - Use navigate(href) - NOT click coordinates
514
+ - Repeat analyze_dom
515
+ Step 4: Extract and return information - DONE
516
+
517
+ NO screenshot needed for information tasks!
518
+
519
+ Example - "Explore/clone website example-company.com":
520
+ → open_new_tab("https://example-company.com")
521
+ → analyze_dom(['navigation', 'structure', 'content']) - get ALL page info
522
+ → Find links: About, Products, Blog, Contact (in navigation.links)
523
+ → navigate("https://example-company.com/about")
524
+ → analyze_dom(['structure', 'content']) - get About page info
525
+ → navigate("https://example-company.com/products")
526
+ → analyze_dom(['structure', 'content']) - get Products page info
527
+ → Continue for all pages
528
+ → DONE (NO screenshot needed at all!)
529
+
530
+ Example - "Find pricing information at loom.com":
531
+ → open_new_tab("https://www.loom.com")
532
+ → analyze_dom(['navigation', 'content'])
533
+ → Find pricing link: {text: "Pricing", href: "https://www.loom.com/pricing"}
534
+ → navigate("https://www.loom.com/pricing")
535
+ → analyze_dom(['content', 'structure'])
536
+ → Extract pricing from content.paragraphs
537
+ → DONE (no screenshot used!)
538
+
539
+ OUTPUT STRUCTURE:
540
+
541
+ navigation: {links: [{text, href}]}
542
+ - ALL links on page (not just nav menu) - comprehensive extraction
543
+ - Deduplicated by href
544
+ - Use href with navigate tool (efficient)
545
+ - Avoid clicking coordinates for navigation
546
+
547
+ structure: {headings: [{level, text, id}], sections: [{heading, summary}]}
548
+ - Page organization and hierarchy
549
+
550
+ content: {paragraphs: [strings]}
551
+ - Text content from various elements (p, div, li, td, span)
552
+ - Deduplicated, up to 100 text items
553
+ - Primary source for information extraction
554
+
555
+ forms: [{formId, action, fields: [{label, type, name, placeholder, required, currentValue}]}]
556
+ - Form structure metadata
557
+
558
+ summary: {url, title, hasIframes, hasCaptcha, scrollableHeight, viewportHeight}
559
+ - Page metadata
560
+ - hasIframes/hasCaptcha: may need take_screenshot
561
+
562
+ IMPORTANT:
563
+
564
+ analyze_dom is PRIMARY method for information extraction tasks.
565
+ Use navigate + analyze_dom workflow - more efficient than screenshot + click.
566
+ Result becomes stale after navigation - re-run on new pages.`,
567
+ schema: {
568
+ include: z.array(z.enum(['navigation', 'structure', 'content', 'forms', 'summary'])).optional().describe("Sections to return. Omit for all. Use ['navigation', 'content'] for most reading tasks.")
569
+ },
570
+ handler: analyzeDomHandler
571
+ },
572
+
573
+ "take_screenshot": {
574
+ description: `Capture viewport screenshot for visual analysis and coordinate determination.
575
+
576
+ TASK CLASSIFICATION:
577
+
578
+ Ask yourself: Is this an INTERACTION task or INFORMATION task?
579
+
580
+ INTERACTION TASKS (clicking, typing, testing):
581
+ → USE: take_screenshot (get coordinates via AI vision)
582
+ → Keywords: "click", "fill form", "type", "test UI", "automate interaction", "submit"
583
+ → THEN: Apply PLANNING PROTOCOL before actions (see actions tool)
584
+
585
+ INFORMATION TASKS (reading, finding, exploring):
586
+ → DO NOT USE: take_screenshot (expensive, unnecessary)
587
+ → Keywords: "explore", "clone", "jelajahi", "find", "read", "get", "extract", "analyze"
588
+ → USE INSTEAD: analyze_dom (10x faster, much cheaper)
589
+
590
+ WHEN TO USE:
591
+ - Need pixel coordinates for clicking/typing (analyze_dom has no coordinates)
592
+ - Visual-only content (calculator displays, canvas, images, charts)
593
+ - Iframe content (hasIframes=true - analyze_dom can't see inside)
594
+ - Obstacles (captcha, popups, modals)
595
+ - UI/visual verification (styling, layout, colors)
596
+ - Before/after visual comparison
597
+
598
+ WHEN TO SKIP:
599
+ - Website exploration/cloning (use analyze_dom + navigate only)
600
+ - Reading page content (analyze_dom.content is faster)
601
+ - Getting links for navigation (analyze_dom.navigation.links)
602
+ - Scrolling to "see more content" (analyze_dom already gets all DOM content)
603
+ - Information extraction tasks (use analyze_dom instead)
604
+ - Navigation (use navigate with href, not click coordinates)
605
+ - Already have recent screenshot and page unchanged
606
+
607
+ Cost consideration:
608
+ - Screenshot is EXPENSIVE (image encoding, AI vision processing)
609
+ - analyze_dom is CHEAP (text extraction, no image processing)
610
+ - Rule: If task is reading/extracting information → use analyze_dom, NOT screenshot
611
+
612
+ COMMON MISTAKES:
613
+
614
+ Mistake: Using screenshot for website exploration
615
+ Example: "Explore website" → take_screenshot → scroll → screenshot → scroll → screenshot
616
+ Fix: Use analyze_dom + navigate only (no screenshot needed)
617
+
618
+ Mistake: Screenshot to "see" text content
619
+ Example: Want to read page → take_screenshot
620
+ Fix: analyze_dom.content gives you all text directly
621
+
622
+ Mistake: Screenshot + scroll to see full page
623
+ Example: take_screenshot → scroll → screenshot → scroll
624
+ Fix: analyze_dom gets entire DOM at once, no scrolling needed
625
+
626
+ OUTPUT:
627
+ PNG image (base64 encoded) showing visible viewport.
628
+
629
+ WORKFLOW AFTER SCREENSHOT:
630
+
631
+ After taking screenshot for interaction tasks:
632
+ 1. AI vision: Analyze screenshot to identify element positions
633
+ 2. Determine pixel coordinates (x, y) for target elements
634
+ 3. Apply PLANNING PROTOCOL (see actions tool) before executing:
635
+ - STATE THE GOAL
636
+ - PLAN THE SEQUENCE (what happens after each action)
637
+ - VERIFY THE LOGIC
638
+ - EFFICIENT ALTERNATIVES
639
+ - VERIFY RESULT after execution
640
+ 4. Execute actions only after verification
641
+
642
+ Example (INCORRECT) - Website exploration:
643
+
644
+ Task: "Explore example-company.com and all pages"
645
+ INCORRECT approach (expensive, slow, unnecessary):
646
+ → open_new_tab
647
+ → take_screenshot (not needed)
648
+ → scroll → take_screenshot (repeat 5 times)
649
+ → navigate to /about
650
+ → take_screenshot (not needed)
651
+ → scroll → take_screenshot
652
+ Total: 10+ screenshots, very expensive!
653
+
654
+ CORRECT approach (fast, efficient):
655
+ → open_new_tab
656
+ → analyze_dom (get all content + links)
657
+ → navigate to /about
658
+ → analyze_dom (get all content)
659
+ → Continue for other pages
660
+ Total: 0 screenshots, 10x faster!
661
+
662
+ Example (CORRECT) - Button interaction:
663
+
664
+ Task: "Click the login button"
665
+ → take_screenshot (need coordinates for interaction)
666
+ → AI vision: "Login button at x=1200, y=45"
667
+ → PLAN: Click button → login form appears
668
+ → VERIFY: Single action achieves goal
669
+ → actions([{click, x:1200, y:45}])
670
+
671
+ Example (CORRECT) - Sequential calculator actions:
672
+
673
+ Task: Enter calculator formula (15 × 8) / (4 + 2)
674
+ → take_screenshot (need coordinates for interaction)
675
+ → AI vision: Identify all needed buttons and their positions
676
+ → CHECK: Is there an input field? If yes, TYPE is more efficient!
677
+ → If TYPE available:
678
+ PLAN: Click input → type formula → calculate
679
+ actions([{click x:640,y:311}, {type "(15*8)/(4+2)"}, {click x:913,y:479}])
680
+ → If must use buttons:
681
+ PLAN each click: "(" at x:672,y:320 → "1" at x:672,y:449 → ...
682
+ VERIFY: Sequence produces correct formula in correct order
683
+ actions([full sequence with verification])
684
+
685
+ Example (CORRECT) - Form filling:
686
+
687
+ Task: "Fill login form"
688
+ → take_screenshot (need coordinates for interaction)
689
+ → AI vision: username x:640,y:300; password x:640,y:380; submit x:640,y:450
690
+ → PLAN:
691
+ 1. Click username → focus
692
+ 2. Type email → entered
693
+ 3. Click password → focus
694
+ 4. Type password → entered
695
+ 5. Click submit → submitted
696
+ → VERIFY: Correct order, correct data, correct targets
697
+ → VERIFY RESULT: After execution, confirm form was filled correctly
698
+ → actions([sequence])
699
+
700
+ OBSTACLE HANDLING:
701
+
702
+ When encountering obstacles (captcha, modals, popups, challenges):
703
+ - DO NOT stop or ask user
704
+ - Attempt to solve/dismiss automatically using AI vision
705
+ - Continue original task after resolving
706
+
707
+ Captcha workflow:
708
+ 1. take_screenshot → identify captcha type
709
+ 2. PLAN solution based on type:
710
+ - Checkbox: click checkbox, wait, verify
711
+ - Image selection: identify matching images, click sequence, verify
712
+ - Text/math: solve, input, submit
713
+ 3. Execute solution
714
+ 4. take_screenshot (verify passed)
715
+ 5. Continue task
716
+
717
+ Modal/popup workflow:
718
+ 1. take_screenshot → identify modal/popup
719
+ 2. Find close button or dismiss action
720
+ 3. actions([click dismiss])
721
+ 4. Continue task
722
+
723
+ IMPORTANT:
724
+ - Coordinates from AI vision are pixel positions in screenshot
725
+ - Can see inside iframes (unlike analyze_dom)
726
+ - Always apply PLANNING PROTOCOL before executing sequential actions
727
+ - When obstacles appear: solve automatically, never stop
728
+ - After actions: take new screenshot to verify or find new elements`,
729
+ handler: takeScreenshotHandler
730
+ },
731
+
732
+ "get_console_logs": {
733
+ description: `Retrieve browser console logs for debugging.
734
+
735
+ WHEN TO USE:
736
+ - Debug JavaScript errors after actions fail
737
+ - Monitor API calls or script execution
738
+ - Detect client-side errors not visible in UI
739
+ - Investigate unexpected behavior
740
+
741
+ OUTPUT: Formatted text, most recent first:
742
+ [2024-01-27 10:30:45] ERROR: Uncaught TypeError: Cannot read property 'value' of null
743
+ [2024-01-27 10:30:44] WARN: Deprecated API usage
744
+
745
+ Logs captured from browser console only - not backend logs. Default 20 entries, max 100. Logs persist until cleared or tab closed.`,
746
+ schema: {
747
+ limit: z.number().min(1).max(100).optional().default(20).describe("Maximum number of log entries to return (min: 1, max: 100, default: 20)")
748
+ },
749
+ handler: getConsoleLogsHandler
750
+ },
751
+
752
+ "clear_console_logs": {
753
+ description: `Clear console logs from backend storage.
754
+
755
+ WHEN TO USE:
756
+ - Before important actions to isolate new logs
757
+ - Clean slate for debugging specific operations
758
+
759
+ OUTPUT: Success confirmation.
760
+
761
+ Only clears backend storage, not browser console. Use before actions for clean debugging context.`,
762
+ handler: clearConsoleLogsHandler
763
+ },
764
+
765
+ "execute_console": {
766
+ description: `Execute JavaScript in browser console.
767
+
768
+ WHEN TO USE:
769
+ - Extract dynamic data not in DOM (computed styles, variables, runtime state)
770
+ - Access browser APIs (localStorage, sessionStorage)
771
+ - Debug page behavior
772
+ - Verify JavaScript context
773
+
774
+ COMMON COMMANDS:
775
+ - document.title
776
+ - window.location.href
777
+ - localStorage.getItem("key")
778
+ - document.querySelector("#id").value
779
+ - Array.from(document.querySelectorAll(".item")).map(el => el.textContent)
780
+
781
+ OUTPUT: Execution result as JSON string, or error message.
782
+
783
+ Has full access to page context. Return value must be JSON-serializable. Use for extraction/debugging, not complex automation.`,
784
+ schema: {
785
+ command: z.string().min(1).describe("JavaScript expression or statement to execute (e.g., 'document.title', 'window.location.href', 'localStorage.getItem(\"key\")')")
786
+ },
787
+ handler: executeConsoleHandler
788
+ },
789
+
790
+ }
791
+ });