browser-use 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (259)
  1. package/README.md +295 -686
  2. package/dist/actor/element.d.ts +19 -0
  3. package/dist/actor/element.js +46 -0
  4. package/dist/actor/index.d.ts +4 -0
  5. package/dist/actor/index.js +4 -0
  6. package/dist/actor/mouse.d.ts +19 -0
  7. package/dist/actor/mouse.js +39 -0
  8. package/dist/actor/page.d.ts +29 -0
  9. package/dist/actor/page.js +88 -0
  10. package/dist/actor/utils.d.ts +4 -0
  11. package/dist/actor/utils.js +35 -0
  12. package/dist/agent/cloud-events.d.ts +18 -0
  13. package/dist/agent/cloud-events.js +65 -2
  14. package/dist/agent/gif.d.ts +1 -0
  15. package/dist/agent/gif.js +24 -2
  16. package/dist/agent/judge.d.ts +17 -0
  17. package/dist/agent/judge.js +197 -0
  18. package/dist/agent/message-manager/service.d.ts +12 -4
  19. package/dist/agent/message-manager/service.js +205 -39
  20. package/dist/agent/message-manager/utils.js +0 -1
  21. package/dist/agent/message-manager/views.d.ts +4 -0
  22. package/dist/agent/message-manager/views.js +11 -7
  23. package/dist/agent/prompts.d.ts +24 -3
  24. package/dist/agent/prompts.js +274 -59
  25. package/dist/agent/service.d.ts +99 -41
  26. package/dist/agent/service.js +2266 -472
  27. package/dist/agent/variable-detector.d.ts +12 -0
  28. package/dist/agent/variable-detector.js +211 -0
  29. package/dist/agent/views.d.ts +237 -18
  30. package/dist/agent/views.js +446 -33
  31. package/dist/browser/cloud/cloud.d.ts +20 -0
  32. package/dist/browser/cloud/cloud.js +129 -0
  33. package/dist/browser/cloud/index.d.ts +2 -0
  34. package/dist/browser/cloud/index.js +2 -0
  35. package/dist/browser/cloud/views.d.ts +41 -0
  36. package/dist/browser/cloud/views.js +35 -0
  37. package/dist/browser/events.d.ts +345 -0
  38. package/dist/browser/events.js +566 -0
  39. package/dist/browser/extensions.js +17 -17
  40. package/dist/browser/index.d.ts +4 -0
  41. package/dist/browser/index.js +4 -0
  42. package/dist/browser/profile.d.ts +8 -2
  43. package/dist/browser/profile.js +79 -12
  44. package/dist/browser/session-manager.d.ts +85 -0
  45. package/dist/browser/session-manager.js +208 -0
  46. package/dist/browser/session.d.ts +100 -8
  47. package/dist/browser/session.js +1097 -58
  48. package/dist/browser/types.d.ts +0 -2
  49. package/dist/browser/views.d.ts +39 -0
  50. package/dist/browser/views.js +32 -0
  51. package/dist/browser/watchdogs/aboutblank-watchdog.d.ts +12 -0
  52. package/dist/browser/watchdogs/aboutblank-watchdog.js +131 -0
  53. package/dist/browser/watchdogs/base.d.ts +21 -0
  54. package/dist/browser/watchdogs/base.js +81 -0
  55. package/dist/browser/watchdogs/cdp-session-watchdog.d.ts +14 -0
  56. package/dist/browser/watchdogs/cdp-session-watchdog.js +177 -0
  57. package/dist/browser/watchdogs/crash-watchdog.d.ts +38 -0
  58. package/dist/browser/watchdogs/crash-watchdog.js +296 -0
  59. package/dist/browser/watchdogs/default-action-watchdog.d.ts +49 -0
  60. package/dist/browser/watchdogs/default-action-watchdog.js +212 -0
  61. package/dist/browser/watchdogs/dom-watchdog.d.ts +8 -0
  62. package/dist/browser/watchdogs/dom-watchdog.js +31 -0
  63. package/dist/browser/watchdogs/downloads-watchdog.d.ts +77 -0
  64. package/dist/browser/watchdogs/downloads-watchdog.js +409 -0
  65. package/dist/browser/watchdogs/har-recording-watchdog.d.ts +19 -0
  66. package/dist/browser/watchdogs/har-recording-watchdog.js +317 -0
  67. package/dist/browser/watchdogs/index.d.ts +15 -0
  68. package/dist/browser/watchdogs/index.js +15 -0
  69. package/dist/browser/watchdogs/local-browser-watchdog.d.ts +10 -0
  70. package/dist/browser/watchdogs/local-browser-watchdog.js +32 -0
  71. package/dist/browser/watchdogs/permissions-watchdog.d.ts +8 -0
  72. package/dist/browser/watchdogs/permissions-watchdog.js +73 -0
  73. package/dist/browser/watchdogs/popups-watchdog.d.ts +13 -0
  74. package/dist/browser/watchdogs/popups-watchdog.js +77 -0
  75. package/dist/browser/watchdogs/recording-watchdog.d.ts +27 -0
  76. package/dist/browser/watchdogs/recording-watchdog.js +249 -0
  77. package/dist/browser/watchdogs/screenshot-watchdog.d.ts +6 -0
  78. package/dist/browser/watchdogs/screenshot-watchdog.js +13 -0
  79. package/dist/browser/watchdogs/security-watchdog.d.ts +10 -0
  80. package/dist/browser/watchdogs/security-watchdog.js +84 -0
  81. package/dist/browser/watchdogs/storage-state-watchdog.d.ts +24 -0
  82. package/dist/browser/watchdogs/storage-state-watchdog.js +288 -0
  83. package/dist/cli.d.ts +7 -2
  84. package/dist/cli.js +182 -25
  85. package/dist/code-use/formatting.d.ts +3 -0
  86. package/dist/code-use/formatting.js +18 -0
  87. package/dist/code-use/index.d.ts +6 -0
  88. package/dist/code-use/index.js +6 -0
  89. package/dist/code-use/namespace.d.ts +5 -0
  90. package/dist/code-use/namespace.js +81 -0
  91. package/dist/code-use/notebook-export.d.ts +3 -0
  92. package/dist/code-use/notebook-export.js +56 -0
  93. package/dist/code-use/service.d.ts +24 -0
  94. package/dist/code-use/service.js +104 -0
  95. package/dist/code-use/utils.d.ts +4 -0
  96. package/dist/code-use/utils.js +98 -0
  97. package/dist/code-use/views.d.ts +108 -0
  98. package/dist/code-use/views.js +165 -0
  99. package/dist/config.d.ts +13 -0
  100. package/dist/config.js +69 -3
  101. package/dist/controller/registry/service.d.ts +10 -1
  102. package/dist/controller/registry/service.js +266 -10
  103. package/dist/controller/registry/views.d.ts +4 -1
  104. package/dist/controller/registry/views.js +25 -2
  105. package/dist/controller/service.d.ts +10 -1
  106. package/dist/controller/service.js +1807 -268
  107. package/dist/controller/views.d.ts +78 -155
  108. package/dist/controller/views.js +61 -12
  109. package/dist/dom/history-tree-processor/service.d.ts +5 -0
  110. package/dist/dom/history-tree-processor/service.js +169 -14
  111. package/dist/dom/history-tree-processor/view.d.ts +7 -1
  112. package/dist/dom/history-tree-processor/view.js +10 -1
  113. package/dist/dom/markdown-extractor.d.ts +37 -0
  114. package/dist/dom/markdown-extractor.js +345 -0
  115. package/dist/dom/service.d.ts +3 -1
  116. package/dist/dom/service.js +76 -0
  117. package/dist/dom/views.d.ts +1 -0
  118. package/dist/dom/views.js +45 -0
  119. package/dist/event-bus.d.ts +107 -7
  120. package/dist/event-bus.js +313 -10
  121. package/dist/exceptions.d.ts +0 -3
  122. package/dist/exceptions.js +0 -7
  123. package/dist/filesystem/file-system.d.ts +18 -0
  124. package/dist/filesystem/file-system.js +503 -42
  125. package/dist/index.d.ts +7 -0
  126. package/dist/index.js +6 -0
  127. package/dist/integrations/gmail/actions.d.ts +3 -3
  128. package/dist/integrations/gmail/actions.js +4 -4
  129. package/dist/llm/anthropic/chat.d.ts +18 -1
  130. package/dist/llm/anthropic/chat.js +123 -55
  131. package/dist/llm/anthropic/serializer.d.ts +2 -0
  132. package/dist/llm/anthropic/serializer.js +81 -9
  133. package/dist/llm/aws/chat-anthropic.d.ts +17 -0
  134. package/dist/llm/aws/chat-anthropic.js +126 -26
  135. package/dist/llm/aws/chat-bedrock.d.ts +28 -1
  136. package/dist/llm/aws/chat-bedrock.js +161 -34
  137. package/dist/llm/aws/serializer.d.ts +13 -1
  138. package/dist/llm/aws/serializer.js +56 -17
  139. package/dist/llm/azure/chat.d.ts +53 -2
  140. package/dist/llm/azure/chat.js +366 -54
  141. package/dist/llm/base.d.ts +2 -0
  142. package/dist/llm/browser-use/chat.d.ts +40 -0
  143. package/dist/llm/browser-use/chat.js +305 -0
  144. package/dist/llm/browser-use/index.d.ts +1 -0
  145. package/dist/llm/browser-use/index.js +1 -0
  146. package/dist/llm/cerebras/chat.d.ts +39 -0
  147. package/dist/llm/cerebras/chat.js +178 -0
  148. package/dist/llm/cerebras/index.d.ts +2 -0
  149. package/dist/llm/cerebras/index.js +2 -0
  150. package/dist/llm/cerebras/serializer.d.ts +7 -0
  151. package/dist/llm/cerebras/serializer.js +82 -0
  152. package/dist/llm/deepseek/chat.d.ts +19 -2
  153. package/dist/llm/deepseek/chat.js +138 -25
  154. package/dist/llm/google/chat.d.ts +46 -2
  155. package/dist/llm/google/chat.js +267 -64
  156. package/dist/llm/google/serializer.d.ts +9 -1
  157. package/dist/llm/google/serializer.js +141 -34
  158. package/dist/llm/groq/chat.d.ts +21 -2
  159. package/dist/llm/groq/chat.js +125 -26
  160. package/dist/llm/groq/parser.js +3 -1
  161. package/dist/llm/mistral/chat.d.ts +43 -0
  162. package/dist/llm/mistral/chat.js +154 -0
  163. package/dist/llm/mistral/index.d.ts +2 -0
  164. package/dist/llm/mistral/index.js +2 -0
  165. package/dist/llm/mistral/schema.d.ts +8 -0
  166. package/dist/llm/mistral/schema.js +27 -0
  167. package/dist/llm/models.d.ts +2 -0
  168. package/dist/llm/models.js +317 -0
  169. package/dist/llm/ollama/chat.d.ts +13 -1
  170. package/dist/llm/ollama/chat.js +110 -19
  171. package/dist/llm/ollama/serializer.d.ts +1 -0
  172. package/dist/llm/ollama/serializer.js +34 -12
  173. package/dist/llm/openai/chat.d.ts +16 -0
  174. package/dist/llm/openai/chat.js +94 -44
  175. package/dist/llm/openai/like.d.ts +5 -3
  176. package/dist/llm/openai/like.js +7 -3
  177. package/dist/llm/openai/responses-serializer.d.ts +18 -0
  178. package/dist/llm/openai/responses-serializer.js +72 -0
  179. package/dist/llm/openrouter/chat.d.ts +28 -2
  180. package/dist/llm/openrouter/chat.js +115 -29
  181. package/dist/llm/schema.d.ts +11 -1
  182. package/dist/llm/schema.js +81 -1
  183. package/dist/llm/vercel/chat.d.ts +50 -0
  184. package/dist/llm/vercel/chat.js +276 -0
  185. package/dist/llm/vercel/index.d.ts +1 -0
  186. package/dist/llm/vercel/index.js +1 -0
  187. package/dist/llm/vercel/serializer.d.ts +5 -0
  188. package/dist/llm/vercel/serializer.js +7 -0
  189. package/dist/llm/views.d.ts +2 -1
  190. package/dist/llm/views.js +3 -1
  191. package/dist/logging-config.d.ts +2 -0
  192. package/dist/logging-config.js +82 -29
  193. package/dist/mcp/client.d.ts +10 -5
  194. package/dist/mcp/client.js +14 -9
  195. package/dist/mcp/controller.d.ts +42 -3
  196. package/dist/mcp/controller.js +56 -31
  197. package/dist/mcp/server.d.ts +14 -0
  198. package/dist/mcp/server.js +255 -52
  199. package/dist/observability.js +10 -4
  200. package/dist/sandbox/index.d.ts +2 -0
  201. package/dist/sandbox/index.js +2 -0
  202. package/dist/sandbox/sandbox.d.ts +19 -0
  203. package/dist/sandbox/sandbox.js +140 -0
  204. package/dist/sandbox/views.d.ts +67 -0
  205. package/dist/sandbox/views.js +121 -0
  206. package/dist/skill-cli/index.d.ts +3 -0
  207. package/dist/skill-cli/index.js +3 -0
  208. package/dist/skill-cli/protocol.d.ts +30 -0
  209. package/dist/skill-cli/protocol.js +48 -0
  210. package/dist/skill-cli/server.d.ts +11 -0
  211. package/dist/skill-cli/server.js +85 -0
  212. package/dist/skill-cli/sessions.d.ts +24 -0
  213. package/dist/skill-cli/sessions.js +47 -0
  214. package/dist/skills/index.d.ts +3 -0
  215. package/dist/skills/index.js +3 -0
  216. package/dist/skills/service.d.ts +27 -0
  217. package/dist/skills/service.js +266 -0
  218. package/dist/skills/utils.d.ts +6 -0
  219. package/dist/skills/utils.js +53 -0
  220. package/dist/skills/views.d.ts +40 -0
  221. package/dist/skills/views.js +10 -0
  222. package/dist/sync/auth.js +8 -3
  223. package/dist/sync/service.d.ts +6 -6
  224. package/dist/sync/service.js +54 -89
  225. package/dist/telemetry/views.d.ts +20 -6
  226. package/dist/telemetry/views.js +23 -5
  227. package/dist/tokens/custom-pricing.d.ts +2 -0
  228. package/dist/tokens/custom-pricing.js +22 -0
  229. package/dist/tokens/index.d.ts +2 -0
  230. package/dist/tokens/index.js +2 -0
  231. package/dist/tokens/mappings.d.ts +1 -0
  232. package/dist/tokens/mappings.js +3 -0
  233. package/dist/tokens/service.js +27 -8
  234. package/dist/tools/extraction/index.d.ts +2 -0
  235. package/dist/tools/extraction/index.js +2 -0
  236. package/dist/tools/extraction/schema-utils.d.ts +6 -0
  237. package/dist/tools/extraction/schema-utils.js +237 -0
  238. package/dist/tools/extraction/views.d.ts +7 -0
  239. package/dist/tools/index.d.ts +5 -0
  240. package/dist/tools/index.js +5 -0
  241. package/dist/tools/registry/index.d.ts +2 -0
  242. package/dist/tools/registry/index.js +2 -0
  243. package/dist/tools/registry/service.d.ts +1 -0
  244. package/dist/tools/registry/service.js +1 -0
  245. package/dist/tools/registry/views.d.ts +1 -0
  246. package/dist/tools/registry/views.js +1 -0
  247. package/dist/tools/service.d.ts +2 -0
  248. package/dist/tools/service.js +1 -0
  249. package/dist/tools/utils.d.ts +2 -0
  250. package/dist/tools/utils.js +57 -0
  251. package/dist/tools/views.d.ts +1 -0
  252. package/dist/tools/views.js +1 -0
  253. package/dist/utils.d.ts +10 -1
  254. package/dist/utils.js +70 -3
  255. package/package.json +87 -26
  256. package/dist/dom/playground/process-dom.js +0 -5
  257. package/dist/dom/playground/test-accessibility.d.ts +0 -44
  258. package/dist/dom/playground/test-accessibility.js +0 -111
  259. package/dist/{dom/playground/process-dom.d.ts → tools/extraction/views.js} +0 -0
@@ -1,14 +1,20 @@
1
- import fs from 'node:fs';
1
+ import fs, { promises as fsp } from 'node:fs';
2
+ import path from 'node:path';
3
+ import { validate as validateJsonSchema } from '@cfworker/json-schema';
2
4
  import { ActionResult } from '../agent/views.js';
5
+ import { ClickCoordinateEvent, ClickElementEvent, CloseTabEvent, GetDropdownOptionsEvent, GoBackEvent, NavigateToUrlEvent, ScrollEvent, ScrollToTextEvent, ScreenshotEvent, SelectDropdownOptionEvent, SendKeysEvent, SwitchTabEvent, TypeTextEvent, UploadFileEvent, WaitEvent, } from '../browser/events.js';
3
6
  import { BrowserError } from '../browser/views.js';
4
- import { FileSystem } from '../filesystem/file-system.js';
5
- import { ClickElementActionSchema, CloseTabActionSchema, DoneActionSchema, ExtractStructuredDataActionSchema, DropdownOptionsActionSchema, SelectDropdownActionSchema, GoToUrlActionSchema, InputTextActionSchema, NoParamsActionSchema, ReadFileActionSchema, ReplaceFileStrActionSchema, ScrollActionSchema, ScrollToTextActionSchema, SearchGoogleActionSchema, StructuredOutputActionSchema, SwitchTabActionSchema, UploadFileActionSchema, WaitActionSchema, WriteFileActionSchema, SendKeysActionSchema, SheetsRangeActionSchema, SheetsUpdateActionSchema, SheetsInputActionSchema, } from './views.js';
7
+ import { chunkMarkdownByStructure, extractCleanMarkdownFromHtml, } from '../dom/markdown-extractor.js';
8
+ import { extractPdfText, FileSystem } from '../filesystem/file-system.js';
9
+ import { ClickElementActionIndexOnlySchema, ClickElementActionSchema, CloseTabActionSchema, DoneActionSchema, EvaluateActionSchema, ExtractStructuredDataActionSchema, FindElementsActionSchema, DropdownOptionsActionSchema, SelectDropdownActionSchema, GoToUrlActionSchema, InputTextActionSchema, NoParamsActionSchema, ReadLongContentActionSchema, ReadFileActionSchema, ReplaceFileStrActionSchema, ScrollActionSchema, ScrollToTextActionSchema, SearchActionSchema, SearchPageActionSchema, SearchGoogleActionSchema, ScreenshotActionSchema, StructuredOutputActionSchema, SwitchTabActionSchema, UploadFileActionSchema, WaitActionSchema, WriteFileActionSchema, SendKeysActionSchema, SheetsRangeActionSchema, SheetsUpdateActionSchema, SheetsInputActionSchema, } from './views.js';
6
10
  import { Registry } from './registry/service.js';
7
- import TurndownService from 'turndown';
8
- import { UserMessage } from '../llm/messages.js';
11
+ import { SystemMessage, UserMessage } from '../llm/messages.js';
9
12
  import { createLogger } from '../logging-config.js';
10
- const DEFAULT_WAIT_OFFSET = 3;
11
- const MAX_WAIT_SECONDS = 10;
13
+ import { sanitize_surrogates } from '../utils.js';
14
+ import { findUnsupportedJsonSchemaKeyword, normalizeStructuredDataBySchema, } from '../tools/extraction/schema-utils.js';
15
+ import { getClickDescription } from '../tools/utils.js';
16
+ const DEFAULT_WAIT_OFFSET = 1;
17
+ const MAX_WAIT_SECONDS = 30;
12
18
  const toActionEntries = (action) => {
13
19
  if (!action) {
14
20
  return [];
@@ -58,6 +64,13 @@ const waitWithSignal = async (timeoutMs, signal) => {
58
64
  }
59
65
  });
60
66
  };
67
+ const dispatchBrowserEventIfAvailable = async (browser_session, event, fallback) => {
68
+ if (typeof browser_session?.dispatch_browser_event === 'function') {
69
+ const dispatchResult = await browser_session.dispatch_browser_event(event);
70
+ return dispatchResult?.event?.event_result ?? null;
71
+ }
72
+ return fallback();
73
+ };
61
74
  const runWithTimeoutAndSignal = async (operation, timeoutMs, signal, timeoutMessage = 'Operation timed out') => {
62
75
  throwIfAborted(signal);
63
76
  if (timeoutMs <= 0) {
@@ -114,14 +127,33 @@ const runWithTimeoutAndSignal = async (operation, timeoutMs, signal, timeoutMess
114
127
  });
115
128
  });
116
129
  };
130
+ const validateAndFixJavaScript = (code) => {
131
+ let fixedCode = code;
132
+ // Fix double-escaped quotes often produced in tool-argument JSON.
133
+ fixedCode = fixedCode.replace(/\\"/g, '"');
134
+ // Fix over-escaped regex tokens (e.g. \\d -> \d).
135
+ fixedCode = fixedCode.replace(/\\\\([dDsSwWbBnrtfv])/g, '\\$1');
136
+ fixedCode = fixedCode.replace(/\\\\([.*+?^${}()|[\]])/g, '\\$1');
137
+ // Convert brittle mixed-quote selectors/XPaths into template literals.
138
+ fixedCode = fixedCode.replace(/document\.evaluate\s*\(\s*"([^"]*)"\s*,/g, (_match, xpath) => `document.evaluate(\`${xpath}\`,`);
139
+ fixedCode = fixedCode.replace(/(querySelector(?:All)?)\s*\(\s*"([^"]*)"\s*\)/g, (_match, methodName, selector) => `${methodName}(\`${selector}\`)`);
140
+ fixedCode = fixedCode.replace(/\.closest\s*\(\s*"([^"]*)"\s*\)/g, (_match, selector) => `.closest(\`${selector}\`)`);
141
+ fixedCode = fixedCode.replace(/\.matches\s*\(\s*"([^"]*)"\s*\)/g, (_match, selector) => `.matches(\`${selector}\`)`);
142
+ return fixedCode;
143
+ };
117
144
  export class Controller {
118
145
  registry;
119
146
  displayFilesInDoneText;
147
+ outputModel;
148
+ coordinateClickingEnabled;
149
+ clickActionHandler = null;
120
150
  logger;
121
151
  constructor(options = {}) {
122
152
  const { exclude_actions = [], output_model = null, display_files_in_done_text = true, } = options;
123
153
  this.registry = new Registry(exclude_actions);
124
154
  this.displayFilesInDoneText = display_files_in_done_text;
155
+ this.outputModel = output_model;
156
+ this.coordinateClickingEnabled = false;
125
157
  this.logger = createLogger('browser_use.controller');
126
158
  this.registerDefaultActions(output_model);
127
159
  }
@@ -131,15 +163,50 @@ export class Controller {
131
163
  this.registerElementActions();
132
164
  this.registerTabActions();
133
165
  this.registerContentActions();
166
+ this.registerExplorationActions();
134
167
  this.registerScrollActions();
135
168
  this.registerFileSystemActions();
169
+ this.registerUtilityActions();
136
170
  this.registerKeyboardActions();
137
171
  this.registerDropdownActions();
138
172
  this.registerSheetsActions();
139
173
  }
140
174
  registerNavigationActions() {
175
+ this.registry.action('Search the query on a web search engine (duckduckgo, google, or bing).', { param_model: SearchActionSchema, terminates_sequence: true })(async function search(params, { browser_session, signal }) {
176
+ if (!browser_session)
177
+ throw new Error('Browser session missing');
178
+ throwIfAborted(signal);
179
+ const requestedEngine = String(params.engine ?? 'duckduckgo');
180
+ const engine = requestedEngine.toLowerCase();
181
+ const encodedQuery = encodeURIComponent(params.query).replace(/%20/g, '+');
182
+ const searchUrlByEngine = {
183
+ duckduckgo: `https://duckduckgo.com/?q=${encodedQuery}`,
184
+ google: `https://www.google.com/search?q=${encodedQuery}&udm=14`,
185
+ bing: `https://www.bing.com/search?q=${encodedQuery}`,
186
+ };
187
+ const searchUrl = searchUrlByEngine[engine];
188
+ if (!searchUrl) {
189
+ return new ActionResult({
190
+ error: `Unsupported search engine: ${requestedEngine}. Options: duckduckgo, google, bing`,
191
+ });
192
+ }
193
+ try {
194
+ await browser_session.navigate_to(searchUrl, { signal });
195
+ const memory = `Searched ${requestedEngine} for '${params.query}'`;
196
+ return new ActionResult({
197
+ extracted_content: memory,
198
+ long_term_memory: memory,
199
+ });
200
+ }
201
+ catch (error) {
202
+ return new ActionResult({
203
+ error: `Failed to search ${requestedEngine} for "${params.query}": ${String(error?.message ?? error)}`,
204
+ });
205
+ }
206
+ });
141
207
  this.registry.action('Search the query in Google...', {
142
208
  param_model: SearchGoogleActionSchema,
209
+ terminates_sequence: true,
143
210
  })(async function search_google(params, { browser_session, signal }) {
144
211
  if (!browser_session)
145
212
  throw new Error('Browser session missing');
@@ -160,9 +227,7 @@ export class Controller {
160
227
  long_term_memory: `Searched Google for '${params.query}'`,
161
228
  });
162
229
  });
163
- this.registry.action('Navigate to URL...', {
164
- param_model: GoToUrlActionSchema,
165
- })(async function go_to_url(params, { browser_session, signal }) {
230
+ const navigateImpl = async function (params, { browser_session, signal, }) {
166
231
  if (!browser_session)
167
232
  throw new Error('Browser session missing');
168
233
  throwIfAborted(signal);
@@ -177,7 +242,7 @@ export class Controller {
177
242
  long_term_memory: `Opened new tab with URL ${params.url}`,
178
243
  });
179
244
  }
180
- await browser_session.navigate_to(params.url, { signal });
245
+ await dispatchBrowserEventIfAvailable(browser_session, new NavigateToUrlEvent({ url: params.url, new_tab: false }), () => browser_session.navigate_to(params.url, { signal }));
181
246
  const msg = `🔗 Navigated to ${params.url}`;
182
247
  return new ActionResult({
183
248
  extracted_content: msg,
@@ -195,44 +260,181 @@ export class Controller {
195
260
  'net::',
196
261
  ];
197
262
  if (networkFailures.some((needle) => errorMsg.includes(needle))) {
198
- const message = `Site unavailable: ${params.url} - ${errorMsg}`;
199
- throw new BrowserError(message);
263
+ return new ActionResult({
264
+ error: `Navigation failed - site unavailable: ${params.url}`,
265
+ });
200
266
  }
201
- throw error;
267
+ if (error instanceof Error &&
268
+ error.name === 'RuntimeError' &&
269
+ errorMsg.includes('CDP client not initialized')) {
270
+ return new ActionResult({
271
+ error: `Browser connection error: ${errorMsg}`,
272
+ });
273
+ }
274
+ return new ActionResult({
275
+ error: `Navigation failed: ${errorMsg}`,
276
+ });
202
277
  }
278
+ };
279
+ this.registry.action('Navigate to URL...', {
280
+ param_model: GoToUrlActionSchema,
281
+ terminates_sequence: true,
282
+ })(async function go_to_url(params, { browser_session, signal }) {
283
+ return navigateImpl(params, { browser_session, signal });
203
284
  });
204
- this.registry.action('Go back', { param_model: NoParamsActionSchema })(async function go_back(_params, { browser_session, signal }) {
285
+ this.registry.action('Navigate to URL...', {
286
+ param_model: GoToUrlActionSchema,
287
+ terminates_sequence: true,
288
+ })(async function navigate(params, { browser_session, signal }) {
289
+ return navigateImpl(params, { browser_session, signal });
290
+ });
291
+ this.registry.action('Go back', {
292
+ param_model: NoParamsActionSchema,
293
+ terminates_sequence: true,
294
+ })(async function go_back(_params, { browser_session, signal }) {
205
295
  if (!browser_session)
206
296
  throw new Error('Browser session missing');
207
297
  throwIfAborted(signal);
208
- await browser_session.go_back({ signal });
209
- const msg = '🔙 Navigated back';
210
- return new ActionResult({ extracted_content: msg });
298
+ try {
299
+ await dispatchBrowserEventIfAvailable(browser_session, new GoBackEvent(), () => browser_session.go_back({ signal }));
300
+ const memory = 'Navigated back';
301
+ return new ActionResult({ extracted_content: memory });
302
+ }
303
+ catch (error) {
304
+ return new ActionResult({
305
+ error: `Failed to go back: ${String(error?.message ?? error)}`,
306
+ });
307
+ }
211
308
  });
212
- this.registry.action('Wait for x seconds default 3 (max 10 seconds). This can be used to wait until the page is fully loaded.', { param_model: WaitActionSchema })(async function wait(params, { signal }) {
309
+ this.registry.action('Wait for x seconds.', {
310
+ param_model: WaitActionSchema,
311
+ })(async function wait(params, { signal, browser_session }) {
213
312
  const seconds = params.seconds ?? 3;
214
313
  const actualSeconds = Math.min(Math.max(seconds - DEFAULT_WAIT_OFFSET, 0), MAX_WAIT_SECONDS);
215
- const msg = `🕒 Waiting for ${actualSeconds + DEFAULT_WAIT_OFFSET} seconds`;
314
+ const msg = `🕒 Waited for ${seconds} second${seconds === 1 ? '' : 's'}`;
216
315
  if (actualSeconds > 0) {
217
- await waitWithSignal(actualSeconds * 1000, signal);
316
+ if (browser_session) {
317
+ await dispatchBrowserEventIfAvailable(browser_session, new WaitEvent({
318
+ seconds: actualSeconds,
319
+ max_seconds: MAX_WAIT_SECONDS,
320
+ }), () => waitWithSignal(actualSeconds * 1000, signal));
321
+ }
322
+ else {
323
+ await waitWithSignal(actualSeconds * 1000, signal);
324
+ }
218
325
  }
219
- return new ActionResult({ extracted_content: msg });
326
+ return new ActionResult({
327
+ extracted_content: msg,
328
+ long_term_memory: `Waited for ${seconds} second${seconds === 1 ? '' : 's'}`,
329
+ });
220
330
  });
221
331
  }
222
332
  registerElementActions() {
223
- this.registry.action('Click element by index', {
224
- param_model: ClickElementActionSchema,
225
- })(async function click_element_by_index(params, { browser_session, signal }) {
333
+ const logger = this.logger;
334
+ const convertLlmCoordinatesToViewport = (llmX, llmY, browserSession) => {
335
+ const llmSize = browserSession?.llm_screenshot_size;
336
+ const viewportSize = browserSession?._original_viewport_size;
337
+ if (!Array.isArray(llmSize) ||
338
+ llmSize.length !== 2 ||
339
+ !Array.isArray(viewportSize) ||
340
+ viewportSize.length !== 2) {
341
+ return [llmX, llmY];
342
+ }
343
+ const [llmWidth, llmHeight] = llmSize.map((value) => Number(value));
344
+ const [viewportWidth, viewportHeight] = viewportSize.map((value) => Number(value));
345
+ if (!Number.isFinite(llmWidth) ||
346
+ !Number.isFinite(llmHeight) ||
347
+ !Number.isFinite(viewportWidth) ||
348
+ !Number.isFinite(viewportHeight) ||
349
+ llmWidth <= 0 ||
350
+ llmHeight <= 0 ||
351
+ viewportWidth <= 0 ||
352
+ viewportHeight <= 0) {
353
+ return [llmX, llmY];
354
+ }
355
+ const actualX = Math.floor((llmX / llmWidth) * viewportWidth);
356
+ const actualY = Math.floor((llmY / llmHeight) * viewportHeight);
357
+ logger.info(`🔄 Converting coordinates: LLM (${llmX}, ${llmY}) @ ${llmWidth}x${llmHeight} -> Viewport (${actualX}, ${actualY}) @ ${viewportWidth}x${viewportHeight}`);
358
+ return [actualX, actualY];
359
+ };
360
+ const clickImpl = async (params, { browser_session, signal }) => {
226
361
  if (!browser_session)
227
362
  throw new Error('Browser session missing');
228
363
  throwIfAborted(signal);
229
- const element = await browser_session.get_dom_element_by_index(params.index, { signal });
364
+ const collectTabIds = () => {
365
+ if (!Array.isArray(browser_session.tabs)) {
366
+ return new Set();
367
+ }
368
+ return new Set(browser_session.tabs
369
+ .map((tab) => tab?.page_id)
370
+ .filter((pageId) => typeof pageId === 'number' && Number.isFinite(pageId)));
371
+ };
372
+ const detectNewTabNote = async (tabsBefore) => {
373
+ try {
374
+ await waitWithSignal(50, signal);
375
+ const tabsAfter = Array.isArray(browser_session.tabs)
376
+ ? browser_session.tabs
377
+ : [];
378
+ const newTab = tabsAfter.find((tab) => {
379
+ const pageId = tab?.page_id;
380
+ return typeof pageId === 'number' && !tabsBefore.has(pageId);
381
+ });
382
+ if (!newTab) {
383
+ return '';
384
+ }
385
+ const tabId = typeof newTab?.tab_id === 'string' && newTab.tab_id.trim()
386
+ ? newTab.tab_id.trim()
387
+ : String(newTab.page_id).padStart(4, '0').slice(-4);
388
+ return `. Note: This opened a new tab (tab_id: ${tabId}) - switch to it if you need to interact with the new page.`;
389
+ }
390
+ catch {
391
+ return '';
392
+ }
393
+ };
394
+ if (params.coordinate_x != null &&
395
+ params.coordinate_y != null &&
396
+ params.index == null) {
397
+ if (!this.coordinateClickingEnabled) {
398
+ throw new BrowserError('Coordinate clicking is disabled for the current model. Provide an element index.');
399
+ }
400
+ const tabsBefore = collectTabIds();
401
+ const page = await browser_session.get_current_page();
402
+ if (!page?.mouse?.click) {
403
+ throw new BrowserError('Unable to perform coordinate click on the current page.');
404
+ }
405
+ const [actualX, actualY] = convertLlmCoordinatesToViewport(params.coordinate_x, params.coordinate_y, browser_session);
406
+ await dispatchBrowserEventIfAvailable(browser_session, new ClickCoordinateEvent({
407
+ coordinate_x: actualX,
408
+ coordinate_y: actualY,
409
+ }), () => page.mouse.click(actualX, actualY));
410
+ const coordinateMessage = `🖱️ Clicked at coordinates (${params.coordinate_x}, ${params.coordinate_y})` +
411
+ (await detectNewTabNote(tabsBefore));
412
+ return new ActionResult({
413
+ extracted_content: coordinateMessage,
414
+ include_in_memory: true,
415
+ long_term_memory: coordinateMessage,
416
+ metadata: {
417
+ click_x: actualX,
418
+ click_y: actualY,
419
+ },
420
+ });
421
+ }
422
+ if (params.index == null) {
423
+ return new ActionResult({
424
+ error: 'Must provide either index or both coordinate_x and coordinate_y',
425
+ });
426
+ }
427
+ const element = await browser_session.get_dom_element_by_index(params.index, {
428
+ signal,
429
+ });
230
430
  if (!element) {
231
- throw new BrowserError(`Element index ${params.index} does not exist - retry or use alternative actions`);
431
+ const msg = `Element index ${params.index} not available - page may have changed. Try refreshing browser state.`;
432
+ logger.warning(`⚠️ ${msg}`);
433
+ return new ActionResult({
434
+ extracted_content: msg,
435
+ });
232
436
  }
233
- const initialTabs = Array.isArray(browser_session.tabs)
234
- ? browser_session.tabs.length
235
- : 0;
437
+ const tabsBefore = collectTabIds();
236
438
  if (browser_session.is_file_input?.(element)) {
237
439
  const msg = `Index ${params.index} - has an element which opens file upload dialog.`;
238
440
  return new ActionResult({
@@ -242,124 +444,430 @@ export class Controller {
242
444
  long_term_memory: msg,
243
445
  });
244
446
  }
245
- const downloadPath = await browser_session._click_element_node(element, {
447
+ const downloadPath = await dispatchBrowserEventIfAvailable(browser_session, new ClickElementEvent({
448
+ node: element,
449
+ button: 'left',
450
+ }), () => browser_session._click_element_node(element, {
246
451
  signal,
247
- });
248
- let msg = '';
452
+ }));
453
+ let msg;
249
454
  if (downloadPath) {
250
455
  msg = `💾 Downloaded file to ${downloadPath}`;
251
456
  }
252
457
  else {
253
- const snippet = element.get_all_text_till_next_clickable_element?.(2) ?? '';
254
- msg = `🖱️ Clicked button with index ${params.index}: ${snippet}`;
255
- }
256
- if (Array.isArray(browser_session.tabs) &&
257
- browser_session.tabs.length > initialTabs) {
258
- msg += ' - New tab opened - switching to it';
259
- await browser_session.switch_to_tab(-1, { signal });
458
+ let elementDescription = '';
459
+ if (typeof element?.tag_name === 'string' &&
460
+ typeof element?.get_all_text_till_next_clickable_element ===
461
+ 'function') {
462
+ try {
463
+ elementDescription = getClickDescription(element);
464
+ }
465
+ catch {
466
+ elementDescription = '';
467
+ }
468
+ }
469
+ if (elementDescription) {
470
+ msg = `🖱️ Clicked ${elementDescription}`;
471
+ }
472
+ else {
473
+ const snippet = element.get_all_text_till_next_clickable_element?.(2) ?? '';
474
+ msg = `🖱️ Clicked button with index ${params.index}: ${snippet}`;
475
+ }
260
476
  }
477
+ msg += await detectNewTabNote(tabsBefore);
261
478
  return new ActionResult({
262
479
  extracted_content: msg,
263
480
  include_in_memory: true,
264
481
  long_term_memory: msg,
265
482
  });
266
- });
267
- this.registry.action('Click and input text into an input interactive element', { param_model: InputTextActionSchema })(async function input_text(params, { browser_session, has_sensitive_data, signal }) {
483
+ };
484
+ this.clickActionHandler = clickImpl;
485
+ this.registerClickActions();
486
+ const detectSensitiveKeyName = (value, sensitiveData) => {
487
+ if (!value || !sensitiveData) {
488
+ return null;
489
+ }
490
+ for (const [domainOrKey, content] of Object.entries(sensitiveData)) {
491
+ if (typeof content === 'string') {
492
+ if (content === value) {
493
+ return domainOrKey;
494
+ }
495
+ continue;
496
+ }
497
+ if (!content || typeof content !== 'object') {
498
+ continue;
499
+ }
500
+ for (const [key, nestedValue] of Object.entries(content)) {
501
+ if (nestedValue === value) {
502
+ return key;
503
+ }
504
+ }
505
+ }
506
+ return null;
507
+ };
508
+ const inputImpl = async function (params, { browser_session, has_sensitive_data, sensitive_data, signal, }) {
268
509
  if (!browser_session)
269
510
  throw new Error('Browser session missing');
270
511
  throwIfAborted(signal);
271
512
  const element = await browser_session.get_dom_element_by_index(params.index, { signal });
272
513
  if (!element) {
273
- throw new BrowserError(`Element index ${params.index} does not exist - retry or use alternative actions`);
514
+ const msg = `Element index ${params.index} not available - page may have changed. Try refreshing browser state.`;
515
+ logger.warning(`⚠️ ${msg}`);
516
+ return new ActionResult({
517
+ extracted_content: msg,
518
+ });
274
519
  }
275
- await browser_session._input_text_element_node(element, params.text, {
520
+ const isAutocompleteField = (node) => {
521
+ const attrs = node?.attributes ?? {};
522
+ const role = String(attrs.role ?? '').toLowerCase();
523
+ const ariaAutocomplete = String(attrs['aria-autocomplete'] ?? '').toLowerCase();
524
+ const hasDatalist = String(attrs.list ?? '').trim().length > 0;
525
+ return (role === 'combobox' ||
526
+ (ariaAutocomplete !== '' && ariaAutocomplete !== 'none') ||
527
+ hasDatalist);
528
+ };
529
+ const needsAutocompleteDelay = (node) => {
530
+ const attrs = node?.attributes ?? {};
531
+ const role = String(attrs.role ?? '').toLowerCase();
532
+ const ariaAutocomplete = String(attrs['aria-autocomplete'] ?? '').toLowerCase();
533
+ return (role === 'combobox' ||
534
+ (ariaAutocomplete !== '' && ariaAutocomplete !== 'none'));
535
+ };
536
+ await dispatchBrowserEventIfAvailable(browser_session, new TypeTextEvent({
537
+ node: element,
538
+ text: params.text,
539
+ clear: params.clear ?? true,
540
+ }), () => browser_session._input_text_element_node(element, params.text, {
541
+ clear: params.clear,
276
542
  signal,
277
- });
278
- const msg = has_sensitive_data
279
- ? `⌨️ Input sensitive data into index ${params.index}`
280
- : `⌨️ Input ${params.text} into index ${params.index}`;
543
+ }));
544
+ let actualValue = null;
545
+ try {
546
+ const locator = await browser_session.get_locate_element?.(element);
547
+ if (locator && typeof locator.inputValue === 'function') {
548
+ const value = await locator.inputValue();
549
+ actualValue = typeof value === 'string' ? value : null;
550
+ }
551
+ }
552
+ catch {
553
+ actualValue = null;
554
+ }
555
+ let msg = `⌨️ Input ${params.text} into index ${params.index}`;
556
+ if (has_sensitive_data) {
557
+ const sensitiveKeyName = detectSensitiveKeyName(params.text, sensitive_data ?? null);
558
+ msg = sensitiveKeyName
559
+ ? `Typed ${sensitiveKeyName}`
560
+ : 'Typed sensitive data';
561
+ }
562
+ if (!has_sensitive_data &&
563
+ actualValue != null &&
564
+ actualValue !== params.text) {
565
+ msg +=
566
+ `\n⚠️ Note: the field's actual value '${actualValue}' differs from typed text '${params.text}'. ` +
567
+ 'The page may have reformatted or autocompleted your input.';
568
+ }
569
+ if (isAutocompleteField(element)) {
570
+ msg +=
571
+ '\n💡 This is an autocomplete field. Wait for suggestions to appear, then click the correct suggestion instead of pressing Enter.';
572
+ if (needsAutocompleteDelay(element)) {
573
+ await waitWithSignal(400, signal);
574
+ }
575
+ }
281
576
  return new ActionResult({
282
577
  extracted_content: msg,
283
578
  include_in_memory: true,
284
- long_term_memory: `Input '${params.text}' into element ${params.index}.`,
579
+ long_term_memory: msg,
580
+ });
581
+ };
582
+ this.registry.action('Click and input text into an input interactive element', { param_model: InputTextActionSchema })(async function input_text(params, { browser_session, has_sensitive_data, sensitive_data, signal }) {
583
+ return inputImpl(params, {
584
+ browser_session,
585
+ has_sensitive_data,
586
+ sensitive_data,
587
+ signal,
588
+ });
589
+ });
590
+ this.registry.action('Click and input text into an input interactive element', { param_model: InputTextActionSchema })(async function input(params, { browser_session, has_sensitive_data, sensitive_data, signal }) {
591
+ return inputImpl(params, {
592
+ browser_session,
593
+ has_sensitive_data,
594
+ sensitive_data,
595
+ signal,
285
596
  });
286
597
  });
287
598
  this.registry.action('Upload file to interactive element with file path', {
288
599
  param_model: UploadFileActionSchema,
289
- })(async function upload_file(params, { browser_session, available_file_paths, signal }) {
600
+ })(async function upload_file(params, { browser_session, available_file_paths, file_system, signal }) {
290
601
  if (!browser_session)
291
602
  throw new Error('Browser session missing');
292
603
  throwIfAborted(signal);
293
- if (!available_file_paths?.includes(params.path)) {
294
- throw new BrowserError(`File path ${params.path} is not available`);
604
+ let uploadPath = params.path;
605
+ const isLocalBrowser = browser_session?.is_local !== false;
606
+ const allowedPaths = new Set(available_file_paths ?? []);
607
+ const downloadedFiles = Array.isArray(browser_session?.downloaded_files)
608
+ ? browser_session.downloaded_files
609
+ : [];
610
+ for (const downloadedPath of downloadedFiles) {
611
+ allowedPaths.add(downloadedPath);
295
612
  }
296
- if (!fs.existsSync(params.path)) {
297
- throw new BrowserError(`File ${params.path} does not exist`);
613
+ if (!allowedPaths.has(uploadPath)) {
614
+ const fsInstance = file_system ?? null;
615
+ const managedFile = fsInstance && typeof fsInstance.get_file === 'function'
616
+ ? fsInstance.get_file(uploadPath)
617
+ : null;
618
+ if (managedFile && fsInstance?.get_dir) {
619
+ uploadPath = path.join(fsInstance.get_dir(), uploadPath);
620
+ }
621
+ else if (!isLocalBrowser) {
622
+ // Remote browser paths may only exist on the remote runtime.
623
+ }
624
+ else {
625
+ return new ActionResult({
626
+ error: `File path ${params.path} is not available. To fix: add this file path to available_file_paths when creating the Agent.`,
627
+ });
628
+ }
298
629
  }
299
- const node = await browser_session.find_file_upload_element_by_index(params.index, 3, 3, { signal });
300
- if (!node) {
301
- throw new BrowserError(`No file upload element found at index ${params.index}`);
630
+ if (isLocalBrowser) {
631
+ if (!fs.existsSync(uploadPath)) {
632
+ return new ActionResult({
633
+ error: `File ${uploadPath} does not exist`,
634
+ });
635
+ }
636
+ if (fs.statSync(uploadPath).size === 0) {
637
+ return new ActionResult({
638
+ error: `File ${uploadPath} is empty (0 bytes). The file may not have been saved correctly.`,
639
+ });
640
+ }
641
+ }
642
+ let selectorMap = null;
643
+ if (typeof browser_session.get_selector_map === 'function') {
644
+ selectorMap = await browser_session.get_selector_map({ signal });
645
+ if (!(params.index in (selectorMap ?? {}))) {
646
+ return new ActionResult({
647
+ error: `Element with index ${params.index} does not exist.`,
648
+ });
649
+ }
302
650
  }
303
- const locator = await browser_session.get_locate_element(node);
304
- if (!locator) {
305
- throw new BrowserError(`No file upload element found at index ${params.index}`);
651
+ let node = await browser_session.find_file_upload_element_by_index(params.index, 3, 3, { signal });
652
+ if (!node &&
653
+ selectorMap &&
654
+ typeof browser_session.is_file_input === 'function') {
655
+ let currentScrollY = 0;
656
+ try {
657
+ const page = await browser_session.get_current_page?.();
658
+ if (page?.evaluate) {
659
+ const evaluated = await page.evaluate(() => window.scrollY || window.pageYOffset || 0);
660
+ const numeric = typeof evaluated === 'number' ? evaluated : Number(evaluated);
661
+ if (Number.isFinite(numeric)) {
662
+ currentScrollY = numeric;
663
+ }
664
+ }
665
+ }
666
+ catch {
667
+ currentScrollY = 0;
668
+ }
669
+ let closest = null;
670
+ let minDistance = Number.POSITIVE_INFINITY;
671
+ for (const element of Object.values(selectorMap)) {
672
+ if (!browser_session.is_file_input(element)) {
673
+ continue;
674
+ }
675
+ const y = Number(element?.absolute_position?.y ?? 0);
676
+ const distance = Number.isFinite(y)
677
+ ? Math.abs(y - currentScrollY)
678
+ : 0;
679
+ if (!closest || distance < minDistance) {
680
+ closest = element;
681
+ minDistance = distance;
682
+ }
683
+ }
684
+ if (closest) {
685
+ node = closest;
686
+ }
306
687
  }
307
- await locator.setInputFiles(params.path);
688
+ if (!node) {
689
+ throw new BrowserError('No file upload element found on the page');
690
+ }
691
+ await dispatchBrowserEventIfAvailable(browser_session, new UploadFileEvent({
692
+ node,
693
+ file_path: uploadPath,
694
+ }), async () => {
695
+ const locator = await browser_session.get_locate_element(node);
696
+ if (!locator) {
697
+ throw new BrowserError('No file upload element found on the page');
698
+ }
699
+ await locator.setInputFiles(uploadPath);
700
+ return null;
701
+ });
308
702
  const msg = `📁 Successfully uploaded file to index ${params.index}`;
309
703
  return new ActionResult({
310
704
  extracted_content: msg,
311
705
  include_in_memory: true,
312
- long_term_memory: `Uploaded file ${params.path} to element ${params.index}`,
706
+ long_term_memory: `Uploaded file ${uploadPath} to element ${params.index}`,
313
707
  });
314
708
  });
315
709
  }
710
+ registerClickActions() {
711
+ const clickActionHandler = this.clickActionHandler;
712
+ if (!clickActionHandler) {
713
+ return;
714
+ }
715
+ const removeAction = this.registry?.remove_action;
716
+ if (typeof removeAction === 'function') {
717
+ removeAction.call(this.registry, 'click');
718
+ removeAction.call(this.registry, 'click_element_by_index');
719
+ }
720
+ const registerIndexAlias = () => {
721
+ this.registry.action('Click element by index.', {
722
+ param_model: ClickElementActionIndexOnlySchema,
723
+ action_name: 'click_element_by_index',
724
+ })(async (params, ctx) => {
725
+ return await clickActionHandler(params, ctx);
726
+ });
727
+ };
728
+ if (this.coordinateClickingEnabled) {
729
+ this.registry.action('Click element by index or coordinates. Use coordinates only if the index is not available. Either provide coordinates or index.', {
730
+ param_model: ClickElementActionSchema,
731
+ action_name: 'click',
732
+ })(async (params, ctx) => {
733
+ return await clickActionHandler(params, ctx);
734
+ });
735
+ registerIndexAlias();
736
+ return;
737
+ }
738
+ this.registry.action('Click element by index.', {
739
+ param_model: ClickElementActionIndexOnlySchema,
740
+ action_name: 'click',
741
+ })(async (params, ctx) => {
742
+ return await clickActionHandler(params, ctx);
743
+ });
744
+ registerIndexAlias();
745
+ }
316
746
  registerTabActions() {
317
- this.registry.action('Switch tab', { param_model: SwitchTabActionSchema })(async function switch_tab(params, ctx) {
318
- const { browser_session, signal } = ctx;
747
+ const tabLogger = this.logger;
748
+ const resolveTabIdentifier = (params) => {
749
+ if (typeof params.tab_id === 'string' && params.tab_id.trim()) {
750
+ return params.tab_id.trim();
751
+ }
752
+ if (typeof params.page_id === 'number' &&
753
+ Number.isFinite(params.page_id)) {
754
+ return params.page_id;
755
+ }
756
+ return -1;
757
+ };
758
+ const formatTabId = (identifier, browser_session) => {
759
+ if (typeof identifier === 'string' && identifier.trim()) {
760
+ return identifier.trim();
761
+ }
762
+ const numericIdentifier = typeof identifier === 'number' && Number.isFinite(identifier)
763
+ ? Math.floor(identifier)
764
+ : -1;
765
+ if (numericIdentifier >= 0) {
766
+ const matchedTab = Array.isArray(browser_session?.tabs)
767
+ ? browser_session.tabs.find((tab) => tab?.page_id === numericIdentifier)
768
+ : null;
769
+ const matchedTabId = typeof matchedTab?.tab_id === 'string' && matchedTab.tab_id.trim()
770
+ ? matchedTab.tab_id.trim()
771
+ : null;
772
+ return (matchedTabId ?? String(numericIdentifier).padStart(4, '0').slice(-4));
773
+ }
774
+ return 'unknown';
775
+ };
776
+ const switchImpl = async function (params, { browser_session, signal, }) {
319
777
  if (!browser_session)
320
778
  throw new Error('Browser session missing');
321
779
  throwIfAborted(signal);
322
- await browser_session.switch_to_tab(params.page_id, { signal });
323
- const page = await browser_session.get_current_page();
780
+ const identifier = resolveTabIdentifier(params);
781
+ const tabId = formatTabId(identifier, browser_session);
324
782
  try {
325
- await page?.wait_for_load_state?.('domcontentloaded', {
326
- timeout: 5000,
783
+ const switchTargetId = identifier === -1 ? null : String(identifier).trim();
784
+ await dispatchBrowserEventIfAvailable(browser_session, new SwitchTabEvent({ target_id: switchTargetId }), () => browser_session.switch_to_tab(identifier, { signal }));
785
+ const page = await browser_session.get_current_page();
786
+ try {
787
+ await page?.wait_for_load_state?.('domcontentloaded', {
788
+ timeout: 5000,
789
+ });
790
+ }
791
+ catch {
792
+ /* ignore */
793
+ }
794
+ const memory = `Switched to tab #${tabId}`;
795
+ return new ActionResult({
796
+ extracted_content: memory,
797
+ long_term_memory: memory,
327
798
  });
328
799
  }
329
- catch {
330
- /* ignore */
800
+ catch (error) {
801
+ tabLogger.warning(`Tab switch may have failed: ${error.message}`);
802
+ const memory = `Attempted to switch to tab #${tabId}`;
803
+ return new ActionResult({
804
+ extracted_content: memory,
805
+ long_term_memory: memory,
806
+ });
331
807
  }
332
- const msg = `🔄 Switched to tab #${params.page_id} with url ${page?.url ?? ''}`;
333
- return new ActionResult({
334
- extracted_content: msg,
335
- include_in_memory: true,
336
- long_term_memory: `Switched to tab ${params.page_id}`,
337
- });
808
+ };
809
+ this.registry.action('Switch tab', {
810
+ param_model: SwitchTabActionSchema,
811
+ terminates_sequence: true,
812
+ })(async function switch_tab(params, { browser_session, signal }) {
813
+ return switchImpl(params, { browser_session, signal });
338
814
  });
339
- this.registry.action('Close an existing tab', {
340
- param_model: CloseTabActionSchema,
341
- })(async function close_tab(params, { browser_session, signal }) {
815
+ this.registry.action('Switch tab', {
816
+ param_model: SwitchTabActionSchema,
817
+ terminates_sequence: true,
818
+ action_name: 'switch',
819
+ })(async function switch_alias(params, { browser_session, signal }) {
820
+ return switchImpl(params, { browser_session, signal });
821
+ });
822
+ const closeImpl = async function (params, { browser_session, signal, }) {
342
823
  if (!browser_session)
343
824
  throw new Error('Browser session missing');
344
825
  throwIfAborted(signal);
345
- await browser_session.switch_to_tab(params.page_id, { signal });
346
- const page = await browser_session.get_current_page();
347
- const url = page?.url ?? '';
348
- await page?.close?.();
349
- const newPage = await browser_session.get_current_page();
350
- const newIndex = browser_session.active_tab_index;
351
- const msg = `❌ Closed tab #${params.page_id} with ${url}, now focused on tab #${newIndex} with url ${newPage?.url ?? ''}`;
352
- return new ActionResult({
353
- extracted_content: msg,
354
- include_in_memory: true,
355
- long_term_memory: `Closed tab ${params.page_id} with url ${url}, now focused on tab ${newIndex} with url ${newPage?.url ?? ''}.`,
356
- });
826
+ const identifier = resolveTabIdentifier(params);
827
+ const closedTabId = formatTabId(identifier, browser_session);
828
+ try {
829
+ const resolvedCloseTargetId = identifier === -1
830
+ ? (browser_session?.active_tab?.target_id ??
831
+ browser_session?.active_tab?.tab_id ??
832
+ null)
833
+ : String(identifier).trim();
834
+ if (!resolvedCloseTargetId) {
835
+ throw new Error('Could not resolve target tab to close');
836
+ }
837
+ await dispatchBrowserEventIfAvailable(browser_session, new CloseTabEvent({ target_id: resolvedCloseTargetId }), () => browser_session.close_tab(identifier));
838
+ const memory = `Closed tab #${closedTabId}`;
839
+ return new ActionResult({
840
+ extracted_content: memory,
841
+ long_term_memory: memory,
842
+ });
843
+ }
844
+ catch (error) {
845
+ tabLogger.warning(`Tab ${closedTabId} may already be closed: ${error.message}`);
846
+ const memory = `Tab #${closedTabId} closed (was already closed or invalid)`;
847
+ return new ActionResult({
848
+ extracted_content: memory,
849
+ long_term_memory: memory,
850
+ });
851
+ }
852
+ };
853
+ this.registry.action('Close an existing tab', {
854
+ param_model: CloseTabActionSchema,
855
+ })(async function close_tab(params, { browser_session, signal }) {
856
+ return closeImpl(params, { browser_session, signal });
857
+ });
858
+ this.registry.action('Close an existing tab', {
859
+ param_model: CloseTabActionSchema,
860
+ })(async function close(params, { browser_session, signal }) {
861
+ return closeImpl(params, { browser_session, signal });
357
862
  });
358
863
  }
359
864
  registerContentActions() {
360
- this.registry.action('Extract structured, semantic data from the current webpage based on a textual query.', {
865
+ const registry = this.registry;
866
+ const contentLogger = this.logger;
867
+ const extractStructuredDescription = "LLM extracts structured data from page markdown. Use when: on right page, know what to extract, haven't called before on same page+query. Can't get interactive elements. Set extract_links=True for URLs. Use start_from_char if previous extraction was truncated to extract data further down the page.";
868
+ this.registry.action(extractStructuredDescription, {
361
869
  param_model: ExtractStructuredDataActionSchema,
362
- })(async function extract_structured_data(params, { page, page_extraction_llm, file_system, signal }) {
870
+ })(async function extract_structured_data(params, { page, page_extraction_llm, extraction_schema, file_system, signal }) {
363
871
  throwIfAborted(signal);
364
872
  if (!page) {
365
873
  throw new BrowserError('No active page available for extraction.');
@@ -368,105 +876,457 @@ export class Controller {
368
876
  throw new BrowserError('page_extraction_llm is not configured.');
369
877
  }
370
878
  const fsInstance = file_system ?? new FileSystem(process.cwd(), false);
371
- const html = await page.content?.();
372
- if (!html) {
879
+ const pageHtml = await runWithTimeoutAndSignal(async () => {
880
+ const value = await page.content?.();
881
+ return typeof value === 'string' ? value : '';
882
+ }, 10000, signal, 'Page content extraction timed out');
883
+ if (!pageHtml) {
373
884
  throw new BrowserError('Unable to extract page content.');
374
885
  }
375
- const turndown = new TurndownService({
376
- headingStyle: 'atx',
377
- codeBlockStyle: 'fenced',
378
- });
379
- let rawHtml = html;
380
- if (!params.extract_links) {
381
- rawHtml = rawHtml.replace(/<a\b[^>]*>/gi, '').replace(/<\/a>/gi, '');
382
- }
383
- let content = turndown.turndown(rawHtml);
384
- content = content.replace(/\n+/g, '\n');
385
- // Manually append iframe text into the content so it's readable by the LLM (includes cross-origin iframes)
386
- const frames = page.frames?.() || [];
886
+ let combinedHtml = pageHtml;
887
+ const frames = typeof page.frames === 'function'
888
+ ? page.frames()
889
+ : Array.isArray(page.frames)
890
+ ? page.frames
891
+ : [];
892
+ const currentUrl = (() => {
893
+ const pageUrlValue = page.url;
894
+ if (typeof pageUrlValue === 'function') {
895
+ return String(pageUrlValue.call(page) ?? '');
896
+ }
897
+ return typeof pageUrlValue === 'string' ? pageUrlValue : '';
898
+ })();
387
899
  for (const iframe of frames) {
388
900
  throwIfAborted(signal);
389
901
  try {
390
- // Wait for iframe to load with aggressive timeout
391
902
  await runWithTimeoutAndSignal(async () => {
392
903
  await iframe.waitForLoadState?.('load');
393
- }, 2000, signal, 'Iframe load timeout');
904
+ }, 1000, signal, 'Iframe load timeout');
394
905
  }
395
906
  catch (error) {
396
907
  if (isAbortError(error)) {
397
908
  throw error;
398
909
  }
399
- // Ignore iframe load errors
400
- }
401
- const iframeUrl = iframe.url?.();
402
- const pageUrl = page.url?.();
403
- if (iframeUrl &&
404
- pageUrl &&
405
- iframeUrl !== pageUrl &&
406
- !iframeUrl.startsWith('data:') &&
407
- !iframeUrl.startsWith('about:')) {
408
- content += `\n\nIFRAME ${iframeUrl}:\n`;
409
- try {
410
- const iframeHtml = await runWithTimeoutAndSignal(async () => (await iframe.content?.()) ?? '', 2000, signal, 'Iframe content extraction timeout');
411
- const iframeMarkdown = turndown.turndown(iframeHtml || '');
412
- content += iframeMarkdown;
910
+ }
911
+ const iframeUrl = typeof iframe.url === 'function'
912
+ ? iframe.url()
913
+ : typeof iframe.url === 'string'
914
+ ? iframe.url
915
+ : '';
916
+ if (!iframeUrl ||
917
+ iframeUrl === currentUrl ||
918
+ iframeUrl.startsWith('data:') ||
919
+ iframeUrl.startsWith('about:')) {
920
+ continue;
921
+ }
922
+ try {
923
+ const iframeHtml = await runWithTimeoutAndSignal(async () => {
924
+ const value = await iframe.content?.();
925
+ return typeof value === 'string' ? value : '';
926
+ }, 2000, signal, 'Iframe content extraction timeout');
927
+ if (!iframeHtml) {
928
+ continue;
413
929
  }
414
- catch (error) {
415
- if (isAbortError(error)) {
416
- throw error;
417
- }
418
- // Skip failed iframes
930
+ combinedHtml += `\n<section><h2>IFRAME ${iframeUrl}</h2>${iframeHtml}</section>`;
931
+ }
932
+ catch (error) {
933
+ if (isAbortError(error)) {
934
+ throw error;
419
935
  }
420
936
  }
421
937
  }
422
- // Replace multiple sequential \n with a single \n
423
- content = content.replace(/\n+/g, '\n');
424
- const maxChars = 30000;
425
- if (content.length > maxChars) {
426
- const head = content.slice(0, maxChars / 2);
427
- const tail = content.slice(-maxChars / 2);
428
- content = `${head}\n... left out the middle because it was too long ...\n${tail}`;
938
+ const extracted = extractCleanMarkdownFromHtml(combinedHtml, {
939
+ extract_links: params.extract_links,
940
+ method: 'page_content',
941
+ url: currentUrl || undefined,
942
+ });
943
+ let content = extracted.content;
944
+ const contentStats = extracted.stats;
945
+ const finalFilteredLength = contentStats.final_filtered_chars;
946
+ const startFromChar = Math.max(0, params.start_from_char ?? 0);
947
+ const maxChars = 100000;
948
+ const chunks = chunkMarkdownByStructure(content, maxChars, 5, startFromChar);
949
+ if (!chunks.length) {
950
+ return new ActionResult({
951
+ error: `start_from_char (${startFromChar}) exceeds content length ${finalFilteredLength} characters.`,
952
+ });
953
+ }
954
+ const chunk = chunks[0];
955
+ content = chunk.content;
956
+ const wasTruncated = chunk.has_more;
957
+ if (chunk.overlap_prefix) {
958
+ content = `${chunk.overlap_prefix}\n${content}`;
959
+ }
960
+ if (startFromChar > 0) {
961
+ contentStats.started_from_char = startFromChar;
962
+ }
963
+ if (wasTruncated) {
964
+ contentStats.truncated_at_char = chunk.char_offset_end;
965
+ contentStats.next_start_char = chunk.char_offset_end;
966
+ contentStats.chunk_index = chunk.chunk_index;
967
+ contentStats.total_chunks = chunk.total_chunks;
968
+ }
969
+ const originalHtmlLength = contentStats.original_html_chars;
970
+ const initialMarkdownLength = contentStats.initial_markdown_chars;
971
+ const charsFiltered = contentStats.filtered_chars_removed;
972
+ let statsSummary = `Content processed: ${originalHtmlLength.toLocaleString()} HTML chars ` +
973
+ `→ ${initialMarkdownLength.toLocaleString()} initial markdown ` +
974
+ `→ ${finalFilteredLength.toLocaleString()} filtered markdown`;
975
+ if (startFromChar > 0) {
976
+ statsSummary += ` (started from char ${startFromChar.toLocaleString()})`;
977
+ }
978
+ if (wasTruncated &&
979
+ contentStats.next_start_char != null &&
980
+ contentStats.chunk_index != null &&
981
+ contentStats.total_chunks != null) {
982
+ const chunkInfo = `chunk ${contentStats.chunk_index + 1} of ${contentStats.total_chunks}, `;
983
+ statsSummary +=
984
+ ` → ${content.length.toLocaleString()} final chars ` +
985
+ `(${chunkInfo}use start_from_char=${contentStats.next_start_char} to continue)`;
986
+ }
987
+ else if (charsFiltered > 0) {
988
+ statsSummary += ` (filtered ${charsFiltered.toLocaleString()} chars of noise)`;
989
+ }
990
+ content = sanitize_surrogates(content);
991
+ const sanitizedQuery = sanitize_surrogates(params.query);
992
+ const parseJsonFromCompletion = (completion) => {
993
+ const trimmed = completion.trim();
994
+ const fencedMatch = trimmed.match(/```(?:json)?\s*([\s\S]*?)```/i);
995
+ const candidate = fencedMatch?.[1]?.trim() || trimmed;
996
+ return JSON.parse(candidate);
997
+ };
998
+ let effectiveOutputSchema = params.output_schema ?? extraction_schema;
999
+ if (effectiveOutputSchema != null) {
1000
+ const unsupportedKeyword = findUnsupportedJsonSchemaKeyword(effectiveOutputSchema);
1001
+ if (unsupportedKeyword) {
1002
+ contentLogger.warning(`Invalid output_schema, falling back to free-text extraction: unsupported keyword '${unsupportedKeyword}'`);
1003
+ effectiveOutputSchema = null;
1004
+ }
1005
+ }
1006
+ const pageUrl = currentUrl || '';
1007
+ const maxMemoryLength = 10000;
1008
+ if (effectiveOutputSchema != null) {
1009
+ const systemPrompt = `
1010
+ You are an expert at extracting structured data from the markdown of a webpage.
1011
+
1012
+ <input>
1013
+ You will be given a query, a JSON Schema, and the markdown of a webpage that has been filtered to remove noise and advertising content.
1014
+ </input>
1015
+
1016
+ <instructions>
1017
+ - Extract ONLY information present in the webpage. Do not guess or fabricate values.
1018
+ - Your response MUST conform to the provided JSON Schema exactly.
1019
+ - If a required field's value cannot be found on the page, use null (if the schema allows it) or an empty string / empty array as appropriate.
1020
+ - If the content was truncated, extract what is available from the visible portion.
1021
+ </instructions>`.trim();
1022
+ const schemaJson = JSON.stringify(effectiveOutputSchema, null, 2);
1023
+ const prompt = `<query>\n${sanitizedQuery}\n</query>\n\n` +
1024
+ `<output_schema>\n${schemaJson}\n</output_schema>\n\n` +
1025
+ `<content_stats>\n${statsSummary}\n</content_stats>\n\n` +
1026
+ `<webpage_content>\n${content}\n</webpage_content>`;
1027
+ const response = await page_extraction_llm.ainvoke([new SystemMessage(systemPrompt), new UserMessage(prompt)], undefined, { signal: signal ?? undefined });
1028
+ throwIfAborted(signal);
1029
+ const completion = response?.completion;
1030
+ const completionText = typeof completion === 'string'
1031
+ ? completion
1032
+ : JSON.stringify(completion ?? {});
1033
+ let parsedResult;
1034
+ try {
1035
+ parsedResult = parseJsonFromCompletion(completionText);
1036
+ }
1037
+ catch (error) {
1038
+ throw new BrowserError(`Structured extraction returned invalid JSON: ${error.message}`);
1039
+ }
1040
+ const schemaValidation = validateJsonSchema(parsedResult, effectiveOutputSchema);
1041
+ if (!schemaValidation.valid) {
1042
+ const details = (schemaValidation.errors ?? [])
1043
+ .slice(0, 3)
1044
+ .map((item) => String(item?.error ?? '').trim())
1045
+ .filter(Boolean)
1046
+ .join('; ');
1047
+ const suffix = details ? `: ${details}` : '';
1048
+ throw new BrowserError(`Structured extraction result does not match output_schema${suffix}`);
1049
+ }
1050
+ const normalizedResult = normalizeStructuredDataBySchema(parsedResult, effectiveOutputSchema);
1051
+ const resultJson = JSON.stringify(normalizedResult);
1052
+ const extractedContent = `<url>\n${pageUrl}\n</url>\n` +
1053
+ `<query>\n${sanitizedQuery}\n</query>\n` +
1054
+ `<structured_result>\n${resultJson}\n</structured_result>`;
1055
+ const extractionMeta = {
1056
+ data: normalizedResult,
1057
+ schema_used: effectiveOutputSchema,
1058
+ is_partial: wasTruncated,
1059
+ source_url: pageUrl,
1060
+ content_stats: contentStats,
1061
+ };
1062
+ const includeOnce = extractedContent.length >= maxMemoryLength;
1063
+ const memory = includeOnce
1064
+ ? `Query: ${sanitizedQuery}\nContent in ${await fsInstance.save_extracted_content(extractedContent)} and once in <read_state>.`
1065
+ : extractedContent;
1066
+ return new ActionResult({
1067
+ extracted_content: extractedContent,
1068
+ include_extracted_content_only_once: includeOnce,
1069
+ long_term_memory: memory,
1070
+ metadata: {
1071
+ structured_extraction: true,
1072
+ extraction_result: extractionMeta,
1073
+ },
1074
+ });
429
1075
  }
430
- const prompt = `You convert websites into structured information. Extract information from this webpage based on the query. Focus only on content relevant to the query. If
431
- 1. The query is vague
432
- 2. Does not make sense for the page
433
- 3. Some/all of the information is not available
1076
+ const systemPrompt = `
1077
+ You are an expert at extracting data from the markdown of a webpage.
434
1078
 
435
- Explain the content of the page and that the requested information is not available in the page. Respond in JSON format.
436
- Query: ${params.query}
437
- Website:
438
- ${content}`;
439
- const extraction = await page_extraction_llm.ainvoke([new UserMessage(prompt)], undefined, { signal: signal ?? undefined });
1079
+ <input>
1080
+ You will be given a query and the markdown of a webpage that has been filtered to remove noise and advertising content.
1081
+ </input>
1082
+
1083
+ <instructions>
1084
+ - You are tasked to extract information from the webpage that is relevant to the query.
1085
+ - You should ONLY use the information available in the webpage to answer the query. Do not make up information or provide guess from your own knowledge.
1086
+ - If the information relevant to the query is not available in the page, your response should mention that.
1087
+ - If the query asks for all items, products, etc., make sure to directly list all of them.
1088
+ - If the content was truncated and you need more information, note that the user can use start_from_char parameter to continue from where truncation occurred.
1089
+ </instructions>
1090
+
1091
+ <output>
1092
+ - Your output should present ALL the information relevant to the query in a concise way.
1093
+ - Do not answer in conversational format - directly output the relevant information or that the information is unavailable.
1094
+ </output>`.trim();
1095
+ const prompt = `<query>\n${sanitizedQuery}\n</query>\n\n` +
1096
+ `<content_stats>\n${statsSummary}\n</content_stats>\n\n` +
1097
+ `<webpage_content>\n${content}\n</webpage_content>`;
1098
+ const response = await page_extraction_llm.ainvoke([new SystemMessage(systemPrompt), new UserMessage(prompt)], undefined, { signal: signal ?? undefined });
440
1099
  throwIfAborted(signal);
441
- const completion = extraction?.completion ?? '';
442
- const extracted_content = `Page Link: ${page.url}\nQuery: ${params.query}\nExtracted Content:\n${completion}`;
443
- let includeOnce = false;
444
- let memory = extracted_content;
445
- const MAX_MEMORY_SIZE = 600;
446
- if (extracted_content.length > MAX_MEMORY_SIZE) {
447
- const lines = extracted_content.split('\n');
448
- let display = '';
449
- let count = 0;
450
- for (const line of lines) {
451
- if (display.length + line.length > MAX_MEMORY_SIZE)
452
- break;
453
- display += `${line}\n`;
454
- count += 1;
1100
+ const completion = response?.completion;
1101
+ const completionText = typeof completion === 'string'
1102
+ ? completion
1103
+ : JSON.stringify(completion ?? {});
1104
+ const extractedContent = `<url>\n${pageUrl}\n</url>\n` +
1105
+ `<query>\n${sanitizedQuery}\n</query>\n` +
1106
+ `<result>\n${completionText}\n</result>`;
1107
+ const includeOnce = extractedContent.length >= maxMemoryLength;
1108
+ const memory = includeOnce
1109
+ ? `Query: ${sanitizedQuery}\nContent in ${await fsInstance.save_extracted_content(extractedContent)} and once in <read_state>.`
1110
+ : extractedContent;
1111
+ return new ActionResult({
1112
+ extracted_content: extractedContent,
1113
+ include_extracted_content_only_once: includeOnce,
1114
+ long_term_memory: memory,
1115
+ });
1116
+ });
1117
+ this.registry.action(extractStructuredDescription, {
1118
+ param_model: ExtractStructuredDataActionSchema,
1119
+ action_name: 'extract',
1120
+ })(async function extract(params, { browser_session, page_extraction_llm, extraction_schema, file_system, available_file_paths, sensitive_data, signal, }) {
1121
+ return registry.execute_action('extract_structured_data', params, {
1122
+ browser_session,
1123
+ page_extraction_llm,
1124
+ extraction_schema,
1125
+ file_system,
1126
+ available_file_paths,
1127
+ sensitive_data,
1128
+ signal,
1129
+ });
1130
+ });
1131
+ }
1132
+ registerExplorationActions() {
1133
+ this.registry.action('Search page text for a pattern (like grep). Zero LLM cost and instant.', { param_model: SearchPageActionSchema })(async function search_page(params, { browser_session, signal }) {
1134
+ if (!browser_session)
1135
+ throw new Error('Browser session missing');
1136
+ throwIfAborted(signal);
1137
+ const page = await browser_session.get_current_page();
1138
+ if (!page?.evaluate) {
1139
+ throw new BrowserError('No active page for search_page.');
1140
+ }
1141
+ const searchResult = (await page.evaluate(({ pattern, regex, caseSensitive, contextChars, cssScope, maxResults, }) => {
1142
+ const sourceNode = cssScope
1143
+ ? document.querySelector(cssScope)
1144
+ : document.body;
1145
+ if (!sourceNode) {
1146
+ return {
1147
+ error: `CSS scope not found: ${cssScope}`,
1148
+ matches: [],
1149
+ total: 0,
1150
+ };
1151
+ }
1152
+ const sourceText = sourceNode.innerText ||
1153
+ sourceNode.textContent ||
1154
+ '';
1155
+ if (!sourceText.trim()) {
1156
+ return {
1157
+ matches: [],
1158
+ total: 0,
1159
+ };
1160
+ }
1161
+ const safePattern = regex
1162
+ ? pattern
1163
+ : pattern.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
1164
+ const flags = caseSensitive ? 'g' : 'gi';
1165
+ let matcher;
1166
+ try {
1167
+ matcher = new RegExp(safePattern, flags);
455
1168
  }
456
- const saveResult = await fsInstance.save_extracted_content(extracted_content);
457
- // NOTE: Do NOT mention file_system tag here as it misleads LLM to use read_file action
458
- // The extracted content preview above is sufficient for most tasks
459
- memory = `Extracted content from ${page.url}\n<query>${params.query}</query>\n<extracted_content>\n${display}${lines.length - count} more lines (auto-saved, no need to read)...\n</extracted_content>`;
460
- includeOnce = true;
1169
+ catch (error) {
1170
+ return {
1171
+ error: `Invalid regex pattern: ${String(error)}`,
1172
+ matches: [],
1173
+ total: 0,
1174
+ };
1175
+ }
1176
+ const matches = [];
1177
+ let foundTotal = 0;
1178
+ let m;
1179
+ while ((m = matcher.exec(sourceText)) !== null) {
1180
+ foundTotal += 1;
1181
+ if (matches.length < Math.max(1, maxResults)) {
1182
+ const start = Math.max(0, m.index - Math.max(0, contextChars));
1183
+ const end = Math.min(sourceText.length, m.index + m[0].length + Math.max(0, contextChars));
1184
+ matches.push({
1185
+ position: m.index,
1186
+ match: m[0],
1187
+ snippet: sourceText.slice(start, end),
1188
+ });
1189
+ }
1190
+ if (m[0].length === 0) {
1191
+ matcher.lastIndex += 1;
1192
+ }
1193
+ }
1194
+ return {
1195
+ matches,
1196
+ total: foundTotal,
1197
+ truncated: foundTotal > matches.length,
1198
+ };
1199
+ }, {
1200
+ pattern: params.pattern,
1201
+ regex: params.regex,
1202
+ caseSensitive: params.case_sensitive,
1203
+ contextChars: params.context_chars,
1204
+ cssScope: params.css_scope ?? null,
1205
+ maxResults: params.max_results,
1206
+ }));
1207
+ if (!searchResult) {
1208
+ return new ActionResult({ error: 'search_page returned no result' });
1209
+ }
1210
+ if (searchResult.error) {
1211
+ return new ActionResult({
1212
+ error: `search_page: ${searchResult.error}`,
1213
+ });
1214
+ }
1215
+ const total = searchResult.total ?? 0;
1216
+ const matches = searchResult.matches ?? [];
1217
+ if (total === 0 || !matches.length) {
1218
+ const noMatchMessage = `No matches found for "${params.pattern}".`;
1219
+ return new ActionResult({
1220
+ extracted_content: noMatchMessage,
1221
+ long_term_memory: `Searched page for "${params.pattern}": 0 matches found.`,
1222
+ });
1223
+ }
1224
+ const lines = [
1225
+ `Found ${total} matches for "${params.pattern}" in page text:`,
1226
+ ];
1227
+ for (let i = 0; i < matches.length; i += 1) {
1228
+ const match = matches[i];
1229
+ const compactSnippet = match.snippet.replace(/\s+/g, ' ').trim();
1230
+ lines.push(`${i + 1}. [pos ${match.position}] "${match.match}" -> ${compactSnippet}`);
1231
+ }
1232
+ if (searchResult.truncated) {
1233
+ lines.push(`... showing first ${matches.length} matches (increase max_results to see more).`);
1234
+ }
1235
+ const memory = `Searched page for "${params.pattern}": ${total} match${total === 1 ? '' : 'es'} found.`;
1236
+ return new ActionResult({
1237
+ extracted_content: lines.join('\n'),
1238
+ long_term_memory: memory,
1239
+ });
1240
+ });
1241
+ this.registry.action('Query DOM elements by CSS selector (like find). Zero LLM cost and instant.', { param_model: FindElementsActionSchema })(async function find_elements(params, { browser_session, signal }) {
1242
+ if (!browser_session)
1243
+ throw new Error('Browser session missing');
1244
+ throwIfAborted(signal);
1245
+ const page = await browser_session.get_current_page();
1246
+ if (!page?.evaluate) {
1247
+ throw new BrowserError('No active page for find_elements.');
1248
+ }
1249
+ const result = (await page.evaluate(({ selector, attributes, maxResults, includeText, }) => {
1250
+ let elements;
1251
+ try {
1252
+ elements = Array.from(document.querySelectorAll(selector));
1253
+ }
1254
+ catch (error) {
1255
+ return {
1256
+ error: `Invalid selector: ${String(error)}`,
1257
+ elements: [],
1258
+ total: 0,
1259
+ };
1260
+ }
1261
+ const selected = elements.slice(0, Math.max(1, maxResults));
1262
+ const payload = selected.map((el, idx) => {
1263
+ const attrs = {};
1264
+ if (attributes?.length) {
1265
+ for (const attr of attributes) {
1266
+ const value = el.getAttribute(attr);
1267
+ if (value != null) {
1268
+ attrs[attr] = value;
1269
+ }
1270
+ }
1271
+ }
1272
+ return {
1273
+ index: idx + 1,
1274
+ tag: el.tagName.toLowerCase(),
1275
+ text: includeText
1276
+ ? (el.textContent || '').replace(/\s+/g, ' ').trim()
1277
+ : '',
1278
+ attributes: attrs,
1279
+ };
1280
+ });
1281
+ return {
1282
+ elements: payload,
1283
+ total: elements.length,
1284
+ truncated: elements.length > selected.length,
1285
+ };
1286
+ }, {
1287
+ selector: params.selector,
1288
+ attributes: params.attributes ?? null,
1289
+ maxResults: params.max_results,
1290
+ includeText: params.include_text,
1291
+ }));
1292
+ if (!result) {
1293
+ return new ActionResult({ error: 'find_elements returned no result' });
1294
+ }
1295
+ if (result.error) {
1296
+ return new ActionResult({ error: `find_elements: ${result.error}` });
1297
+ }
1298
+ const elements = result.elements ?? [];
1299
+ const total = result.total ?? 0;
1300
+ if (!elements.length) {
1301
+ const msg = `No elements found for selector "${params.selector}".`;
1302
+ return new ActionResult({
1303
+ extracted_content: msg,
1304
+ long_term_memory: msg,
1305
+ });
1306
+ }
1307
+ const lines = [
1308
+ `Found ${total} element${total === 1 ? '' : 's'} for selector "${params.selector}":`,
1309
+ ];
1310
+ for (const el of elements) {
1311
+ const attrs = Object.entries(el.attributes || {})
1312
+ .map(([k, v]) => `${k}=${JSON.stringify(v)}`)
1313
+ .join(' ');
1314
+ const text = params.include_text && el.text
1315
+ ? ` text=${JSON.stringify(el.text)}`
1316
+ : '';
1317
+ lines.push(`${el.index}. <${el.tag}>${text}${attrs ? ` ${attrs}` : ''}`.trim());
1318
+ }
1319
+ if (result.truncated) {
1320
+ lines.push(`... showing first ${elements.length} elements (increase max_results to see more).`);
461
1321
  }
462
1322
  return new ActionResult({
463
- extracted_content,
464
- include_extracted_content_only_once: includeOnce,
465
- long_term_memory: memory,
1323
+ extracted_content: lines.join('\n'),
1324
+ long_term_memory: `Queried selector "${params.selector}" and found ${total} element${total === 1 ? '' : 's'}.`,
466
1325
  });
467
1326
  });
468
1327
  }
469
1328
  registerScrollActions() {
1329
+ const registry = this.registry;
470
1330
  const scrollLogger = this.logger; // Capture logger reference for use in named function
471
1331
  // Define the scroll handler implementation (shared by multiple action names for LLM compatibility)
472
1332
  const scrollImpl = async (params, { browser_session, signal }) => {
@@ -487,7 +1347,9 @@ ${content}`;
487
1347
  }
488
1348
  catch (error) {
489
1349
  if (i === retries - 1) {
490
- throw new Error(`Scroll failed due to an error: ${error}`);
1350
+ throw new Error(`Scroll failed due to an error: ${error}`, {
1351
+ cause: error,
1352
+ });
491
1353
  }
492
1354
  await waitWithSignal(1000, signal);
493
1355
  }
@@ -495,8 +1357,8 @@ ${content}`;
495
1357
  return 0;
496
1358
  };
497
1359
  const windowHeight = await getWindowHeight();
498
- const scrollAmount = Math.floor(windowHeight * params.num_pages);
499
- const pagesScrolled = params.num_pages;
1360
+ const pagesScrolled = params.pages ?? params.num_pages ?? 1;
1361
+ const scrollAmount = Math.floor(windowHeight * pagesScrolled);
500
1362
  const dy = params.down ? scrollAmount : -scrollAmount;
501
1363
  const direction = params.down ? 'down' : 'up';
502
1364
  let scrollTarget = 'the page';
@@ -505,7 +1367,9 @@ ${content}`;
505
1367
  try {
506
1368
  const elementNode = await browser_session.get_dom_element_by_index(params.index, { signal });
507
1369
  if (!elementNode) {
508
- throw new Error(`Element index ${params.index} does not exist - retry or use alternative actions`);
1370
+ return new ActionResult({
1371
+ error: `Element index ${params.index} not found in browser state`,
1372
+ });
509
1373
  }
510
1374
  // Try direct container scrolling (no events that might close dropdowns)
511
1375
  const containerScrollJs = `
@@ -518,8 +1382,6 @@ ${content}`;
518
1382
  return { success: false, reason: 'Element not found by XPath' };
519
1383
  }
520
1384
 
521
- console.log('[SCROLL DEBUG] Starting direct container scroll for element:', targetElement.tagName);
522
-
523
1385
  // Try to find scrollable containers in the hierarchy (starting from element itself)
524
1386
  let currentElement = targetElement;
525
1387
  let scrollSuccess = false;
@@ -533,12 +1395,6 @@ ${content}`;
533
1395
  const hasScrollableY = /(auto|scroll|overlay)/.test(computedStyle.overflowY);
534
1396
  const canScrollVertically = currentElement.scrollHeight > currentElement.clientHeight;
535
1397
 
536
- console.log('[SCROLL DEBUG] Checking element:', currentElement.tagName,
537
- 'hasScrollableY:', hasScrollableY,
538
- 'canScrollVertically:', canScrollVertically,
539
- 'scrollHeight:', currentElement.scrollHeight,
540
- 'clientHeight:', currentElement.clientHeight);
541
-
542
1398
  if (hasScrollableY && canScrollVertically) {
543
1399
  const beforeScroll = currentElement.scrollTop;
544
1400
  const maxScroll = currentElement.scrollHeight - currentElement.clientHeight;
@@ -559,14 +1415,10 @@ ${content}`;
559
1415
  const afterScroll = currentElement.scrollTop;
560
1416
  const actualScrollDelta = afterScroll - beforeScroll;
561
1417
 
562
- console.log('[SCROLL DEBUG] Scroll attempt:', currentElement.tagName,
563
- 'before:', beforeScroll, 'after:', afterScroll, 'delta:', actualScrollDelta);
564
-
565
1418
  if (Math.abs(actualScrollDelta) > 0.5) {
566
1419
  scrollSuccess = true;
567
1420
  scrolledElement = currentElement;
568
1421
  scrollDelta = actualScrollDelta;
569
- console.log('[SCROLL DEBUG] Successfully scrolled container:', currentElement.tagName, 'delta:', actualScrollDelta);
570
1422
  break;
571
1423
  }
572
1424
  }
@@ -592,7 +1444,6 @@ ${content}`;
592
1444
  };
593
1445
  } else {
594
1446
  // No container found or could scroll
595
- console.log('[SCROLL DEBUG] No scrollable container found for element');
596
1447
  return {
597
1448
  success: false,
598
1449
  reason: 'No scrollable container found',
@@ -639,7 +1490,10 @@ ${content}`;
639
1490
  scrollTarget.includes('mouse wheel failed')) {
640
1491
  scrollLogger.debug(`🔄 Performing page-level scrolling. Reason: ${scrollTarget}`);
641
1492
  try {
642
- await browser_session._scrollContainer(dy);
1493
+ await dispatchBrowserEventIfAvailable(browser_session, new ScrollEvent({
1494
+ direction,
1495
+ amount: Math.abs(dy),
1496
+ }), () => browser_session._scrollContainer(dy));
643
1497
  }
644
1498
  catch (error) {
645
1499
  // Hard fallback: always works on root scroller
@@ -709,27 +1563,33 @@ ${content}`;
709
1563
  })(async function scroll_to_text(params, { browser_session }) {
710
1564
  if (!browser_session)
711
1565
  throw new Error('Browser session missing');
712
- const page = await browser_session.get_current_page();
713
- if (!page?.evaluate) {
714
- throw new BrowserError('Unable to access page for scrolling.');
715
- }
716
- const success = await page.evaluate(({ text }) => {
717
- const iterator = document.createNodeIterator(document.body, NodeFilter.SHOW_ELEMENT);
718
- let node;
719
- while ((node = iterator.nextNode())) {
720
- const el = node;
721
- if (!el || !el.textContent)
722
- continue;
723
- if (el.textContent.toLowerCase().includes(text.toLowerCase())) {
724
- el.scrollIntoView({ behavior: 'smooth', block: 'center' });
725
- return true;
1566
+ await dispatchBrowserEventIfAvailable(browser_session, new ScrollToTextEvent({
1567
+ text: params.text,
1568
+ direction: 'down',
1569
+ }), async () => {
1570
+ const page = await browser_session.get_current_page();
1571
+ if (!page?.evaluate) {
1572
+ throw new BrowserError('Unable to access page for scrolling.');
1573
+ }
1574
+ const success = await page.evaluate(({ text }) => {
1575
+ const iterator = document.createNodeIterator(document.body, NodeFilter.SHOW_ELEMENT);
1576
+ let node;
1577
+ while ((node = iterator.nextNode())) {
1578
+ const el = node;
1579
+ if (!el || !el.textContent)
1580
+ continue;
1581
+ if (el.textContent.toLowerCase().includes(text.toLowerCase())) {
1582
+ el.scrollIntoView({ behavior: 'smooth', block: 'center' });
1583
+ return true;
1584
+ }
726
1585
  }
1586
+ return false;
1587
+ }, { text: params.text });
1588
+ if (!success) {
1589
+ throw new BrowserError(`Text '${params.text}' not found on page`);
727
1590
  }
728
- return false;
729
- }, { text: params.text });
730
- if (!success) {
731
- throw new BrowserError(`Text '${params.text}' not found on page`);
732
- }
1591
+ return null;
1592
+ });
733
1593
  const msg = `🔍 Scrolled to text: ${params.text}`;
734
1594
  return new ActionResult({
735
1595
  extracted_content: msg,
@@ -737,18 +1597,49 @@ ${content}`;
737
1597
  long_term_memory: msg,
738
1598
  });
739
1599
  });
1600
+ this.registry.action('Scroll to text.', {
1601
+ param_model: ScrollToTextActionSchema,
1602
+ action_name: 'find_text',
1603
+ })(async function find_text(params, ctx) {
1604
+ try {
1605
+ return await registry.execute_action('scroll_to_text', params, ctx);
1606
+ }
1607
+ catch (error) {
1608
+ if (isAbortError(error)) {
1609
+ throw error;
1610
+ }
1611
+ const msg = `Text '${params.text}' not found or not visible on page`;
1612
+ return new ActionResult({
1613
+ extracted_content: msg,
1614
+ long_term_memory: `Tried scrolling to text '${params.text}' but it was not found`,
1615
+ });
1616
+ }
1617
+ });
740
1618
  }
741
1619
  registerFileSystemActions() {
742
- this.registry.action('Read file_name from file system', {
1620
+ const registry = this.registry;
1621
+ this.registry.action('Read the complete content of a file. Use this to view file contents before editing or to retrieve data from files. Supports text files (txt, md, json, csv, jsonl), documents (pdf, docx), and images (jpg, png).', {
743
1622
  param_model: ReadFileActionSchema,
744
1623
  })(async function read_file(params, { file_system, available_file_paths }) {
745
1624
  const fsInstance = file_system ?? new FileSystem(process.cwd(), false);
746
1625
  const allowed = Array.isArray(available_file_paths) &&
747
1626
  available_file_paths.includes(params.file_name);
748
- const result = await fsInstance.read_file(params.file_name, allowed);
1627
+ const structuredResult = typeof fsInstance.read_file_structured === 'function'
1628
+ ? await fsInstance.read_file_structured(params.file_name, allowed)
1629
+ : {
1630
+ message: await fsInstance.read_file(params.file_name, allowed),
1631
+ images: null,
1632
+ };
1633
+ const result = String(structuredResult?.message ?? '');
1634
+ const images = Array.isArray(structuredResult?.images)
1635
+ ? structuredResult.images
1636
+ : null;
749
1637
  const MAX_MEMORY_SIZE = 1000;
750
1638
  let memory = result;
751
- if (result.length > MAX_MEMORY_SIZE) {
1639
+ if (images && images.length > 0) {
1640
+ memory = `Read image file ${params.file_name}`;
1641
+ }
1642
+ else if (result.length > MAX_MEMORY_SIZE) {
752
1643
  const lines = result.split('\n');
753
1644
  let preview = '';
754
1645
  let used = 0;
@@ -764,12 +1655,373 @@ ${content}`;
764
1655
  }
765
1656
  return new ActionResult({
766
1657
  extracted_content: result,
767
- include_in_memory: true,
768
1658
  long_term_memory: memory,
1659
+ images,
769
1660
  include_extracted_content_only_once: true,
770
1661
  });
771
1662
  });
772
- this.registry.action('Write content to file', {
1663
+ this.registry.action('Intelligently read long content to find specific information. Works on current page (source="page") or files. For large content, uses search to identify relevant sections. Best for long articles, documents, or any content where you know what you are looking for.', { param_model: ReadLongContentActionSchema })(async function read_long_content(params, { browser_session, page_extraction_llm, available_file_paths, signal }) {
1664
+ throwIfAborted(signal);
1665
+ const goal = params.goal.trim();
1666
+ const source = (params.source || 'page').trim();
1667
+ const context = (params.context || '').trim();
1668
+ const maxChars = 50000;
1669
+ const chunkSize = 2000;
1670
+ const fallbackSearchTerms = (() => {
1671
+ const tokens = `${goal} ${context}`
1672
+ .toLowerCase()
1673
+ .match(/[a-z0-9][a-z0-9-]{2,}/g);
1674
+ if (!tokens?.length) {
1675
+ return goal ? [goal] : ['content'];
1676
+ }
1677
+ return Array.from(new Set(tokens)).slice(0, 5);
1678
+ })();
1679
+ const extractSearchTerms = async () => {
1680
+ const extractionLlm = page_extraction_llm;
1681
+ if (!extractionLlm || typeof extractionLlm.ainvoke !== 'function') {
1682
+ return fallbackSearchTerms;
1683
+ }
1684
+ const prompt = `Extract 3-5 key search terms from this goal that would help find relevant sections.
1685
+ Return only the terms, one per line, no numbering or bullets.
1686
+
1687
+ Goal: ${goal}
1688
+
1689
+ Context: ${context}`;
1690
+ try {
1691
+ const response = await runWithTimeoutAndSignal(async () => (await extractionLlm.ainvoke([new UserMessage(prompt)], undefined, { signal: signal ?? undefined })), 12000, signal, 'Timed out extracting search terms');
1692
+ const parsed = (response?.completion ?? '')
1693
+ .split('\n')
1694
+ .map((line) => line
1695
+ .trim()
1696
+ .replace(/^[-\d.)\s]+/, '')
1697
+ .trim())
1698
+ .filter(Boolean);
1699
+ const unique = Array.from(new Set(parsed)).slice(0, 5);
1700
+ return unique.length ? unique : fallbackSearchTerms;
1701
+ }
1702
+ catch (error) {
1703
+ if (isAbortError(error)) {
1704
+ throw error;
1705
+ }
1706
+ return fallbackSearchTerms;
1707
+ }
1708
+ };
1709
/**
 * Escapes every regular-expression metacharacter in `value` so that the
 * result matches the input literally when passed to `new RegExp`.
 *
 * @param {string} value - Raw text to embed in a pattern.
 * @returns {string} The text with metacharacters backslash-escaped.
 */
const escapeRegExp = (value) => value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

/**
 * Finds every case-insensitive occurrence of `pattern` in `value`.
 *
 * `pattern` is first tried as regular-expression source; when it does not
 * compile (for example `C++`) it is escaped and matched literally instead.
 * Zero-length matches are ignored: they carry no search signal, and a manual
 * `exec` loop would never advance past one.
 *
 * @param {string} value - Text to scan.
 * @param {string} pattern - Regex source or literal search term.
 * @param {number} [contextChars=100] - Characters of surrounding text kept on
 *   each side of a match in `snippet`; negative values are treated as 0.
 * @returns {Array<{position: number, snippet: string}>} One entry per
 *   non-empty match, in order of appearance. `position` is the match's start
 *   index in `value`.
 */
const searchText = (value, pattern, contextChars = 100) => {
    let regex;
    try {
        regex = new RegExp(pattern, 'gi');
    }
    catch {
        // Not valid regex source: fall back to a literal search.
        regex = new RegExp(escapeRegExp(pattern), 'gi');
    }
    const padding = Math.max(0, contextChars);
    const matches = [];
    // matchAll steps past empty matches on its own, so patterns such as `x*`
    // cannot stall the scan.
    for (const match of value.matchAll(regex)) {
        if (match[0].length === 0) {
            continue;
        }
        const start = Math.max(0, match.index - padding);
        const end = Math.min(value.length, match.index + match[0].length + padding);
        matches.push({
            position: match.index,
            snippet: value.slice(start, end),
        });
    }
    return matches;
};
1734
/**
 * Splits `value` into consecutive slices of at most `size` characters.
 *
 * @param {string} value - Text to split.
 * @param {number} [size=chunkSize] - Maximum length of each slice; defaults
 *   to the enclosing handler's `chunkSize`.
 * @returns {Array<{start: number, end: number, text: string}>} Slices in
 *   order; `start` is inclusive and `end` exclusive, both offsets into
 *   `value`. An empty `value` yields an empty array.
 * @throws {RangeError} When `size` is not a positive number. A zero or
 *   negative step would never advance the loop.
 */
const chunkContent = (value, size = chunkSize) => {
    // `!(size > 0)` also rejects NaN, which a plain `size <= 0` would let through.
    if (!(size > 0)) {
        throw new RangeError(`chunkContent size must be a positive number, received ${size}`);
    }
    const chunks = [];
    for (let start = 0; start < value.length; start += size) {
        const end = Math.min(start + size, value.length);
        chunks.push({
            start,
            end,
            text: value.slice(start, end),
        });
    }
    return chunks;
};
1745
+ const getCurrentPageUrl = (page) => {
1746
+ const value = page?.url;
1747
+ if (typeof value === 'function') {
1748
+ return String(value.call(page) ?? '');
1749
+ }
1750
+ return typeof value === 'string' ? value : '';
1751
+ };
1752
+ const readPdfByPage = async (filePath) => {
1753
+ const buffer = await fsp.readFile(filePath);
1754
+ try {
1755
+ const pdfParseModule = (await import('pdf-parse'));
1756
+ if (typeof pdfParseModule.PDFParse === 'function') {
1757
+ const Parser = pdfParseModule.PDFParse;
1758
+ const parser = new Parser({ data: buffer });
1759
+ try {
1760
+ let numPages = 0;
1761
+ try {
1762
+ const info = await parser.getInfo?.({ parsePageInfo: false });
1763
+ numPages = Number(info?.total ?? 0);
1764
+ }
1765
+ catch {
1766
+ numPages = 0;
1767
+ }
1768
+ if (!Number.isFinite(numPages) || numPages <= 0) {
1769
+ const full = await parser.getText();
1770
+ const text = typeof full?.text === 'string' ? full.text : '';
1771
+ return {
1772
+ numPages: 1,
1773
+ pageTexts: [text],
1774
+ totalChars: text.length,
1775
+ };
1776
+ }
1777
+ const pageTexts = [];
1778
+ let totalChars = 0;
1779
+ for (let pageNumber = 1; pageNumber <= numPages; pageNumber += 1) {
1780
+ const pageResult = await parser.getText({
1781
+ partial: [pageNumber],
1782
+ });
1783
+ const text = typeof pageResult?.text === 'string' ? pageResult.text : '';
1784
+ pageTexts.push(text);
1785
+ totalChars += text.length;
1786
+ }
1787
+ return {
1788
+ numPages,
1789
+ pageTexts,
1790
+ totalChars,
1791
+ };
1792
+ }
1793
+ finally {
1794
+ if (typeof parser.destroy === 'function') {
1795
+ await parser.destroy();
1796
+ }
1797
+ }
1798
+ }
1799
+ }
1800
+ catch {
1801
+ // Fall back to the compatibility parser.
1802
+ }
1803
+ const parsed = await extractPdfText(buffer);
1804
+ const text = parsed.text ?? '';
1805
+ return {
1806
+ numPages: Math.max(parsed.totalPages, 1),
1807
+ pageTexts: [text],
1808
+ totalChars: text.length,
1809
+ };
1810
+ };
1811
+ try {
1812
+ let content = '';
1813
+ let sourceName = 'content';
1814
+ if (source.toLowerCase() === 'page') {
1815
+ if (!browser_session) {
1816
+ throw new BrowserError('Browser session missing for page content.');
1817
+ }
1818
+ const page = await browser_session.get_current_page();
1819
+ if (!page?.content) {
1820
+ throw new BrowserError('No active page available to read content.');
1821
+ }
1822
+ const html = await page.content();
1823
+ const pageUrl = getCurrentPageUrl(page);
1824
+ const markdown = extractCleanMarkdownFromHtml(html || '', {
1825
+ extract_links: false,
1826
+ method: 'page_content',
1827
+ url: pageUrl || undefined,
1828
+ });
1829
+ content = markdown.content;
1830
+ sourceName = 'current page';
1831
+ if (!content) {
1832
+ return new ActionResult({
1833
+ extracted_content: 'Error: No page content available',
1834
+ long_term_memory: 'Failed to read page: no content',
1835
+ });
1836
+ }
1837
+ }
1838
+ else {
1839
+ const allowedPaths = new Set(Array.isArray(available_file_paths) ? available_file_paths : []);
1840
+ const downloadedFiles = Array.isArray(browser_session?.downloaded_files)
1841
+ ? browser_session.downloaded_files
1842
+ : [];
1843
+ for (const filePath of downloadedFiles) {
1844
+ allowedPaths.add(filePath);
1845
+ }
1846
+ if (!allowedPaths.has(source)) {
1847
+ const message = `Error: File path not in available_file_paths: ${source}. ` +
1848
+ 'The user must add this path to available_file_paths when creating the Agent.';
1849
+ return new ActionResult({
1850
+ extracted_content: message,
1851
+ long_term_memory: `Failed to read: file path not allowed: ${source}`,
1852
+ });
1853
+ }
1854
+ if (!fs.existsSync(source)) {
1855
+ return new ActionResult({
1856
+ extracted_content: `Error: File not found: ${source}`,
1857
+ long_term_memory: 'Failed to read: file not found',
1858
+ });
1859
+ }
1860
+ const ext = path.extname(source).toLowerCase();
1861
+ sourceName = path.basename(source);
1862
+ if (ext === '.pdf') {
1863
+ const pdfData = await readPdfByPage(source);
1864
+ const numPages = pdfData.numPages;
1865
+ const pageTexts = pdfData.pageTexts;
1866
+ const totalChars = pdfData.totalChars;
1867
+ if (totalChars <= maxChars) {
1868
+ const contentParts = [];
1869
+ for (let pageIndex = 0; pageIndex < pageTexts.length; pageIndex += 1) {
1870
+ const pageText = pageTexts[pageIndex] ?? '';
1871
+ if (!pageText.trim()) {
1872
+ continue;
1873
+ }
1874
+ contentParts.push(`--- Page ${pageIndex + 1} ---\n${pageText}`);
1875
+ }
1876
+ const allContent = contentParts.join('\n\n');
1877
+ return new ActionResult({
1878
+ extracted_content: `PDF: ${sourceName} (${numPages} pages)\n\n${allContent}`,
1879
+ long_term_memory: `Read ${sourceName} (${numPages} pages, ${totalChars.toLocaleString()} chars) for goal: ${goal.slice(0, 50)}`,
1880
+ include_extracted_content_only_once: true,
1881
+ });
1882
+ }
1883
+ const searchTerms = await extractSearchTerms();
1884
+ const pageScores = new Map();
1885
+ for (const term of searchTerms) {
1886
+ if (!term.trim()) {
1887
+ continue;
1888
+ }
1889
+ const pattern = new RegExp(escapeRegExp(term), 'i');
1890
+ for (let pageIndex = 0; pageIndex < pageTexts.length; pageIndex += 1) {
1891
+ const pageText = pageTexts[pageIndex] ?? '';
1892
+ if (pattern.test(pageText)) {
1893
+ const pageNumber = pageIndex + 1;
1894
+ pageScores.set(pageNumber, (pageScores.get(pageNumber) ?? 0) + 1);
1895
+ }
1896
+ }
1897
+ }
1898
+ const pagesToRead = [1];
1899
+ const sortedPages = Array.from(pageScores.entries()).sort((a, b) => b[1] - a[1]);
1900
+ for (const [pageNumber] of sortedPages) {
1901
+ if (!pagesToRead.includes(pageNumber)) {
1902
+ pagesToRead.push(pageNumber);
1903
+ }
1904
+ }
1905
+ const contentParts = [];
1906
+ let charsUsed = 0;
1907
+ const pagesIncluded = [];
1908
+ const pageOrder = Array.from(new Set(pagesToRead)).sort((a, b) => a - b);
1909
+ for (const pageNumber of pageOrder) {
1910
+ const pageText = pageTexts[pageNumber - 1] ?? '';
1911
+ const pageHeader = `--- Page ${pageNumber} ---\n`;
1912
+ const remaining = maxChars - charsUsed;
1913
+ if (remaining < pageHeader.length + 50) {
1914
+ break;
1915
+ }
1916
+ let pageContent = `${pageHeader}${pageText}`;
1917
+ if (pageContent.length > remaining) {
1918
+ const truncationSuffix = '\n[...truncated]';
1919
+ pageContent =
1920
+ pageContent.slice(0, remaining - truncationSuffix.length) +
1921
+ truncationSuffix;
1922
+ }
1923
+ contentParts.push(pageContent);
1924
+ charsUsed += pageContent.length;
1925
+ pagesIncluded.push(pageNumber);
1926
+ }
1927
+ const partialPdfContent = contentParts.join('\n\n');
1928
+ return new ActionResult({
1929
+ extracted_content: `PDF: ${sourceName} (${numPages} pages, showing ${pagesIncluded.length} relevant)\n\n` +
1930
+ partialPdfContent,
1931
+ long_term_memory: `Read ${sourceName} (${pagesIncluded.length} relevant pages of ${numPages}) ` +
1932
+ `for goal: ${goal.slice(0, 50)}`,
1933
+ include_extracted_content_only_once: true,
1934
+ });
1935
+ }
1936
+ const fileBuffer = await fsp.readFile(source);
1937
+ content = fileBuffer.toString('utf-8');
1938
+ }
1939
+ if (!content.trim()) {
1940
+ return new ActionResult({
1941
+ extracted_content: `Error: No readable content found in ${sourceName}`,
1942
+ long_term_memory: `Failed to read ${sourceName}: no content`,
1943
+ });
1944
+ }
1945
+ if (content.length <= maxChars) {
1946
+ return new ActionResult({
1947
+ extracted_content: `Content from ${sourceName} (${content.length.toLocaleString()} chars):\n\n${content}`,
1948
+ long_term_memory: `Read ${sourceName} (${content.length.toLocaleString()} chars) for goal: ${goal.slice(0, 50)}`,
1949
+ include_extracted_content_only_once: true,
1950
+ });
1951
+ }
1952
+ const searchTerms = await extractSearchTerms();
1953
+ const chunks = chunkContent(content, chunkSize);
1954
+ const chunkScores = new Map();
1955
+ for (const term of searchTerms) {
1956
+ const matches = searchText(content, term);
1957
+ for (const match of matches) {
1958
+ for (let index = 0; index < chunks.length; index += 1) {
1959
+ const chunk = chunks[index];
1960
+ if (chunk &&
1961
+ chunk.start <= match.position &&
1962
+ match.position < chunk.end) {
1963
+ chunkScores.set(index, (chunkScores.get(index) ?? 0) + 1);
1964
+ break;
1965
+ }
1966
+ }
1967
+ }
1968
+ }
1969
+ if (!chunkScores.size) {
1970
+ const truncated = content.slice(0, maxChars);
1971
+ return new ActionResult({
1972
+ extracted_content: `Content from ${sourceName} (first ${maxChars.toLocaleString()} of ${content.length.toLocaleString()} chars):\n\n${truncated}`,
1973
+ long_term_memory: `Read ${sourceName} (truncated to ${maxChars.toLocaleString()} chars, no matches for search terms)`,
1974
+ include_extracted_content_only_once: true,
1975
+ });
1976
+ }
1977
+ const sortedChunks = Array.from(chunkScores.entries()).sort((a, b) => b[1] - a[1]);
1978
+ const selectedIndices = new Set([0]);
1979
+ for (const [chunkIndex] of sortedChunks) {
1980
+ selectedIndices.add(chunkIndex);
1981
+ }
1982
+ const resultParts = [];
1983
+ let totalChars = 0;
1984
+ const orderedIndices = Array.from(selectedIndices).sort((a, b) => a - b);
1985
+ for (const index of orderedIndices) {
1986
+ const chunk = chunks[index];
1987
+ if (!chunk) {
1988
+ continue;
1989
+ }
1990
+ if (totalChars + chunk.text.length > maxChars) {
1991
+ break;
1992
+ }
1993
+ const previousIndex = index - 1;
1994
+ if (index > 0 && !selectedIndices.has(previousIndex)) {
1995
+ resultParts.push('\n[...]\n');
1996
+ }
1997
+ resultParts.push(chunk.text);
1998
+ totalChars += chunk.text.length;
1999
+ }
2000
+ const resultContent = resultParts.join('');
2001
+ return new ActionResult({
2002
+ extracted_content: `Content from ${sourceName} (relevant sections, ${totalChars.toLocaleString()} of ${content.length.toLocaleString()} chars):\n\n` +
2003
+ resultContent,
2004
+ long_term_memory: `Read ${sourceName} (${selectedIndices.size} relevant sections of ${chunks.length}) ` +
2005
+ `for goal: ${goal.slice(0, 50)}`,
2006
+ include_extracted_content_only_once: true,
2007
+ });
2008
+ }
2009
+ catch (error) {
2010
+ if (isAbortError(error)) {
2011
+ throw error;
2012
+ }
2013
+ const errorMessage = `Error reading content: ${String(error.message ?? error)}`;
2014
+ return new ActionResult({
2015
+ extracted_content: errorMessage,
2016
+ long_term_memory: errorMessage,
2017
+ });
2018
+ }
2019
+ });
2020
+ this.registry.action('Write content to a file. By default this OVERWRITES the entire file - use append=true to add to an existing file, or use replace_file for targeted edits within a file. ' +
2021
+ 'FILENAME RULES: Use only letters, numbers, underscores, hyphens, dots, parentheses. Spaces are auto-converted to hyphens. ' +
2022
+ 'SUPPORTED EXTENSIONS: .txt, .md, .json, .jsonl, .csv, .html, .xml, .pdf, .docx. ' +
2023
+ 'CANNOT write binary/image files (.png, .jpg, .mp4, etc.) - do not attempt to save screenshots as files. ' +
2024
+ 'For PDF files, write content in markdown format and it will be auto-converted to PDF.', {
773
2025
  param_model: WriteFileActionSchema,
774
2026
  })(async function write_file(params, { file_system }) {
775
2027
  const fsInstance = file_system ?? new FileSystem(process.cwd(), false);
@@ -786,49 +2038,168 @@ ${content}`;
786
2038
  const result = append
787
2039
  ? await fsInstance.append_file(params.file_name, content)
788
2040
  : await fsInstance.write_file(params.file_name, content);
789
- const msg = `📝 ${result}`;
790
2041
  return new ActionResult({
791
2042
  extracted_content: result,
792
- include_in_memory: true,
793
2043
  long_term_memory: result,
794
2044
  });
795
2045
  });
796
- this.registry.action('Replace text within an existing file', {
2046
+ this.registry.action('Replace specific text within a file by searching for old_str and replacing with new_str. Use this for targeted edits like updating todo checkboxes or modifying specific lines without rewriting the entire file.', {
797
2047
  param_model: ReplaceFileStrActionSchema,
798
2048
  })(async function replace_file_str(params, { file_system }) {
799
2049
  const fsInstance = file_system ?? new FileSystem(process.cwd(), false);
800
2050
  const result = await fsInstance.replace_file_str(params.file_name, params.old_str, params.new_str);
801
2051
  return new ActionResult({
802
2052
  extracted_content: result,
803
- include_in_memory: true,
804
2053
  long_term_memory: result,
805
2054
  });
806
2055
  });
2056
+ this.registry.action('Replace specific text within a file by searching for old_str and replacing with new_str. Use this for targeted edits like updating todo checkboxes or modifying specific lines without rewriting the entire file.', {
2057
+ param_model: ReplaceFileStrActionSchema,
2058
+ action_name: 'replace_file',
2059
+ })(async function replace_file(params, ctx) {
2060
+ return registry.execute_action('replace_file_str', params, ctx);
2061
+ });
807
2062
  }
808
- registerKeyboardActions() {
809
- this.registry.action('Send keys to the active page', {
810
- param_model: SendKeysActionSchema,
811
- })(async function send_keys(params, { browser_session }) {
2063
+ registerUtilityActions() {
2064
+ this.registry.action('Take a screenshot of the current viewport. If file_name is provided, saves to that file and returns the path. Otherwise, screenshot is included in the next browser_state observation.', { param_model: ScreenshotActionSchema })(async function screenshot(params, { browser_session, file_system, signal }) {
812
2065
  if (!browser_session)
813
2066
  throw new Error('Browser session missing');
814
- const page = await browser_session.get_current_page();
815
- const keyboard = page?.keyboard;
816
- if (!keyboard) {
817
- throw new BrowserError('Keyboard input is not available on the current page.');
2067
+ throwIfAborted(signal);
2068
+ if (params.file_name) {
2069
+ const screenshotB64 = await dispatchBrowserEventIfAvailable(browser_session, new ScreenshotEvent({
2070
+ full_page: false,
2071
+ }), async () => (await browser_session.take_screenshot?.(false)) ?? null);
2072
+ if (!screenshotB64) {
2073
+ return new ActionResult({
2074
+ error: 'Failed to capture screenshot.',
2075
+ });
2076
+ }
2077
+ const fsInstance = file_system ?? new FileSystem(process.cwd(), false);
2078
+ let fileName = params.file_name;
2079
+ if (!fileName.toLowerCase().endsWith('.png')) {
2080
+ fileName = `${fileName}.png`;
2081
+ }
2082
+ fileName = FileSystem.sanitize_filename(fileName);
2083
+ const filePath = path.join(fsInstance.get_dir(), fileName);
2084
+ await fsp.writeFile(filePath, Buffer.from(screenshotB64, 'base64'));
2085
+ const msg = `📸 Saved screenshot to ${filePath}`;
2086
+ return new ActionResult({
2087
+ extracted_content: msg,
2088
+ long_term_memory: msg,
2089
+ attachments: [filePath],
2090
+ });
818
2091
  }
819
- try {
820
- await keyboard.press(params.keys);
2092
+ return new ActionResult({
2093
+ extracted_content: 'Requested screenshot for next observation',
2094
+ metadata: {
2095
+ include_screenshot: true,
2096
+ },
2097
+ });
2098
+ });
2099
+ this.registry.action('Execute browser JavaScript on the current page and return the result.', { param_model: EvaluateActionSchema })(async function evaluate(params, { browser_session, signal }) {
2100
+ if (!browser_session)
2101
+ throw new Error('Browser session missing');
2102
+ throwIfAborted(signal);
2103
+ const page = await browser_session.get_current_page();
2104
+ if (!page?.evaluate) {
2105
+ throw new BrowserError('No active page available for evaluate.');
821
2106
  }
822
- catch (error) {
823
- if (error instanceof Error && error.message.includes('Unknown key')) {
824
- for (const char of params.keys) {
825
- await keyboard.press(char);
2107
+ const validatedCode = validateAndFixJavaScript(params.code);
2108
+ const payload = (await page.evaluate(async ({ code }) => {
2109
+ const serialize = (value) => {
2110
+ if (value === undefined) {
2111
+ return null;
2112
+ }
2113
+ try {
2114
+ return JSON.parse(JSON.stringify(value));
826
2115
  }
2116
+ catch {
2117
+ return String(value);
2118
+ }
2119
+ };
2120
+ try {
2121
+ const raw = await Promise.resolve((0, eval)(code));
2122
+ return { ok: true, result: serialize(raw) };
827
2123
  }
828
- else {
829
- throw error;
2124
+ catch (error) {
2125
+ return {
2126
+ ok: false,
2127
+ error: error instanceof Error
2128
+ ? error.message
2129
+ : String(error ?? 'Unknown evaluate error'),
2130
+ };
2131
+ }
2132
+ }, { code: validatedCode }));
2133
+ if (!payload) {
2134
+ return new ActionResult({ error: 'evaluate returned no result' });
2135
+ }
2136
+ if (!payload.ok) {
2137
+ const codePreview = validatedCode.length > 500
2138
+ ? `${validatedCode.slice(0, 500)}...`
2139
+ : validatedCode;
2140
+ return new ActionResult({
2141
+ error: `JavaScript Execution Failed:\n` +
2142
+ `JavaScript execution error: ${payload.error ?? 'Unknown error'}\n\n` +
2143
+ `Validated Code (after quote fixing):\n${codePreview}`,
2144
+ });
2145
+ }
2146
+ let rendered = typeof payload.result === 'string'
2147
+ ? payload.result
2148
+ : JSON.stringify(payload.result);
2149
+ const imagePattern = /(data:image\/[^;]+;base64,[A-Za-z0-9+/=]+)/g;
2150
+ const foundImages = rendered.match(imagePattern) ?? [];
2151
+ let metadata = null;
2152
+ if (foundImages.length > 0) {
2153
+ metadata = { images: foundImages };
2154
+ for (const imageData of foundImages) {
2155
+ rendered = rendered.split(imageData).join('[Image]');
830
2156
  }
831
2157
  }
2158
+ const maxChars = 20000;
2159
+ if (rendered.length > maxChars) {
2160
+ rendered = `${rendered.slice(0, maxChars - 50)}\n... [Truncated after 20000 characters]`;
2161
+ }
2162
+ const maxMemoryChars = 10000;
2163
+ const includeExtractedContentOnlyOnce = rendered.length >= maxMemoryChars;
2164
+ const longTermMemory = includeExtractedContentOnlyOnce
2165
+ ? `JavaScript executed successfully, result length: ${rendered.length} characters.`
2166
+ : rendered;
2167
+ return new ActionResult({
2168
+ extracted_content: rendered,
2169
+ long_term_memory: longTermMemory,
2170
+ include_extracted_content_only_once: includeExtractedContentOnlyOnce,
2171
+ metadata,
2172
+ });
2173
+ });
2174
+ }
2175
+ registerKeyboardActions() {
2176
+ this.registry.action('Send keys to the active page', {
2177
+ param_model: SendKeysActionSchema,
2178
+ })(async function send_keys(params, { browser_session }) {
2179
+ if (!browser_session)
2180
+ throw new Error('Browser session missing');
2181
+ await dispatchBrowserEventIfAvailable(browser_session, new SendKeysEvent({ keys: params.keys }), async () => {
2182
+ const page = await browser_session.get_current_page();
2183
+ const keyboard = page?.keyboard;
2184
+ if (!keyboard) {
2185
+ throw new BrowserError('Keyboard input is not available on the current page.');
2186
+ }
2187
+ try {
2188
+ await keyboard.press(params.keys);
2189
+ }
2190
+ catch (error) {
2191
+ if (error instanceof Error &&
2192
+ error.message.includes('Unknown key')) {
2193
+ for (const char of params.keys) {
2194
+ await keyboard.press(char);
2195
+ }
2196
+ }
2197
+ else {
2198
+ throw error;
2199
+ }
2200
+ }
2201
+ return null;
2202
+ });
832
2203
  const msg = `⌨️ Sent keys: ${params.keys}`;
833
2204
  return new ActionResult({
834
2205
  extracted_content: msg,
@@ -838,15 +2209,42 @@ ${content}`;
838
2209
  });
839
2210
  }
840
2211
  registerDropdownActions() {
2212
+ const registry = this.registry;
2213
+ const dropdownLogger = this.logger;
2214
+ const formatAvailableOptions = (options) => options
2215
+ .map((opt) => ` - [${opt.index}] text=${JSON.stringify(opt.text)} value=${JSON.stringify(opt.value)}`)
2216
+ .join('\n');
841
2217
  this.registry.action('Get all options from a native dropdown or ARIA menu', { param_model: DropdownOptionsActionSchema })(async function get_dropdown_options(params, { browser_session, signal }) {
842
2218
  if (!browser_session)
843
2219
  throw new Error('Browser session missing');
844
2220
  throwIfAborted(signal);
845
- const page = await browser_session.get_current_page();
846
2221
  const domElement = await browser_session.get_dom_element_by_index(params.index, { signal });
847
2222
  if (!domElement) {
848
- throw new BrowserError(`Element index ${params.index} does not exist.`);
2223
+ const msg = `Element index ${params.index} not available - page may have changed. Try refreshing browser state.`;
2224
+ dropdownLogger.warning(`⚠️ ${msg}`);
2225
+ return new ActionResult({
2226
+ extracted_content: msg,
2227
+ });
2228
+ }
2229
+ if (typeof browser_session.dispatch_browser_event === 'function') {
2230
+ const dispatchResult = await browser_session.dispatch_browser_event(new GetDropdownOptionsEvent({ node: domElement }));
2231
+ const eventResult = dispatchResult?.event?.event_result;
2232
+ const eventMessage = eventResult?.message ??
2233
+ eventResult?.short_term_memory ??
2234
+ eventResult?.formatted_options ??
2235
+ null;
2236
+ if (eventMessage) {
2237
+ const memory = eventResult?.long_term_memory ??
2238
+ `Found dropdown options for index ${params.index}.`;
2239
+ return new ActionResult({
2240
+ extracted_content: eventMessage,
2241
+ include_in_memory: true,
2242
+ include_extracted_content_only_once: true,
2243
+ long_term_memory: memory,
2244
+ });
2245
+ }
849
2246
  }
2247
+ const page = await browser_session.get_current_page();
850
2248
  if (!page?.evaluate) {
851
2249
  throw new BrowserError('Unable to evaluate dropdown options on current page.');
852
2250
  }
@@ -859,8 +2257,8 @@ ${content}`;
859
2257
  return null;
860
2258
  if (element.tagName?.toLowerCase() === 'select') {
861
2259
  const options = Array.from(element.options).map((opt, index) => ({
862
- text: opt.text,
863
- value: opt.value,
2260
+ text: opt.textContent?.trim() ?? '',
2261
+ value: (opt.value ?? '').trim(),
864
2262
  index,
865
2263
  }));
866
2264
  return { type: 'select', options };
@@ -881,8 +2279,8 @@ ${content}`;
881
2279
  if (!payload || !payload.options?.length) {
882
2280
  throw new BrowserError('No options found for the specified dropdown.');
883
2281
  }
884
- const formatted = payload.options.map((opt) => `${opt.index}: text=${JSON.stringify(opt.text ?? '')}`);
885
- formatted.push('Use the exact text string in select_dropdown_option');
2282
+ const formatted = payload.options.map((opt) => `${opt.index}: text=${JSON.stringify(opt.text ?? '')}, value=${JSON.stringify(opt.value ?? '')}`);
2283
+ formatted.push('Prefer exact text first; if needed select_dropdown_option also supports case-insensitive text/value matching.');
886
2284
  const message = formatted.join('\n');
887
2285
  return new ActionResult({
888
2286
  extracted_content: message,
@@ -891,17 +2289,49 @@ ${content}`;
891
2289
  long_term_memory: `Found dropdown options for index ${params.index}.`,
892
2290
  });
893
2291
  });
2292
+ this.registry.action('Get all options from a native dropdown or ARIA menu', {
2293
+ param_model: DropdownOptionsActionSchema,
2294
+ action_name: 'dropdown_options',
2295
+ })(async function dropdown_options(params, ctx) {
2296
+ return registry.execute_action('get_dropdown_options', params, ctx);
2297
+ });
894
2298
  this.registry.action('Select dropdown option or ARIA menu item by text', {
895
2299
  param_model: SelectDropdownActionSchema,
896
2300
  })(async function select_dropdown_option(params, { browser_session, signal }) {
897
2301
  if (!browser_session)
898
2302
  throw new Error('Browser session missing');
899
2303
  throwIfAborted(signal);
900
- const page = await browser_session.get_current_page();
901
2304
  const domElement = await browser_session.get_dom_element_by_index(params.index, { signal });
2305
+ if (!domElement) {
2306
+ const msg = `Element index ${params.index} not available - page may have changed. Try refreshing browser state.`;
2307
+ dropdownLogger.warning(`⚠️ ${msg}`);
2308
+ return new ActionResult({
2309
+ extracted_content: msg,
2310
+ });
2311
+ }
902
2312
  if (!domElement?.xpath) {
903
2313
  throw new BrowserError('DOM element does not include an XPath selector.');
904
2314
  }
2315
+ if (typeof browser_session.dispatch_browser_event === 'function') {
2316
+ const dispatchResult = await browser_session.dispatch_browser_event(new SelectDropdownOptionEvent({
2317
+ node: domElement,
2318
+ text: params.text,
2319
+ }));
2320
+ const eventResult = dispatchResult?.event?.event_result;
2321
+ const eventMessage = eventResult?.message ??
2322
+ eventResult?.short_term_memory ??
2323
+ eventResult?.matched_text ??
2324
+ null;
2325
+ if (eventMessage) {
2326
+ const memory = eventResult?.long_term_memory ?? eventMessage;
2327
+ return new ActionResult({
2328
+ extracted_content: eventMessage,
2329
+ include_in_memory: true,
2330
+ long_term_memory: memory,
2331
+ });
2332
+ }
2333
+ }
2334
+ const page = await browser_session.get_current_page();
905
2335
  if (!page) {
906
2336
  throw new BrowserError('No active page for selection.');
907
2337
  }
@@ -922,45 +2352,121 @@ ${content}`;
922
2352
  if (!typeInfo?.found)
923
2353
  continue;
924
2354
  if (typeInfo.type === 'select') {
925
- await frame
926
- .locator(domElement.xpath)
927
- .first()
928
- .select_option({ label: params.text });
929
- const msg = `Selected option ${params.text}`;
930
- return new ActionResult({
931
- extracted_content: msg,
932
- include_in_memory: true,
933
- long_term_memory: msg,
934
- });
2355
+ const selection = await frame.evaluate(({ xpath, text }) => {
2356
+ const root = document.evaluate(xpath, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
2357
+ if (!root || root.tagName?.toLowerCase() !== 'select') {
2358
+ return { found: false };
2359
+ }
2360
+ const options = Array.from(root.options).map((opt, index) => ({
2361
+ index,
2362
+ text: opt.textContent?.trim() ?? '',
2363
+ value: (opt.value ?? '').trim(),
2364
+ }));
2365
+ const normalize = (value) => value.trim().toLowerCase();
2366
+ const targetRaw = text.trim();
2367
+ const targetLower = normalize(text);
2368
+ let matchedIndex = options.findIndex((opt) => opt.text === targetRaw || opt.value === targetRaw);
2369
+ if (matchedIndex < 0) {
2370
+ matchedIndex = options.findIndex((opt) => normalize(opt.text) === targetLower ||
2371
+ normalize(opt.value) === targetLower);
2372
+ }
2373
+ if (matchedIndex < 0) {
2374
+ return { found: true, success: false, options };
2375
+ }
2376
+ const matched = options[matchedIndex];
2377
+ root.value = matched.value;
2378
+ root.dispatchEvent(new Event('input', { bubbles: true }));
2379
+ root.dispatchEvent(new Event('change', { bubbles: true }));
2380
+ const selectedOption = root.selectedIndex >= 0
2381
+ ? root.options[root.selectedIndex]
2382
+ : null;
2383
+ const selectedText = selectedOption?.textContent?.trim() ?? '';
2384
+ const selectedValue = (root.value ?? '').trim();
2385
+ const verified = normalize(selectedValue) === normalize(matched.value) ||
2386
+ normalize(selectedText) === normalize(matched.text);
2387
+ return {
2388
+ found: true,
2389
+ success: verified,
2390
+ options,
2391
+ selectedText,
2392
+ selectedValue,
2393
+ matched,
2394
+ };
2395
+ }, { xpath: domElement.xpath, text: params.text });
2396
+ if (selection?.found && selection.success) {
2397
+ const matchedText = selection.matched?.text ?? params.text;
2398
+ const matchedValue = selection.matched?.value ?? '';
2399
+ const msg = `Selected option ${matchedText} (${matchedValue})`;
2400
+ return new ActionResult({
2401
+ extracted_content: msg,
2402
+ include_in_memory: true,
2403
+ long_term_memory: msg,
2404
+ });
2405
+ }
2406
+ if (selection?.found) {
2407
+ const details = formatAvailableOptions(selection.options ?? []);
2408
+ throw new BrowserError(`Could not select option '${params.text}' for index ${params.index}.\nAvailable options:\n${details}`);
2409
+ }
2410
+ continue;
935
2411
  }
936
2412
  const clicked = await frame.evaluate(({ xpath, text }) => {
937
2413
  const root = document.evaluate(xpath, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
938
2414
  if (!root)
939
2415
  return false;
940
2416
  const nodes = root.querySelectorAll('[role="menuitem"],[role="option"]');
941
- for (const node of Array.from(nodes)) {
942
- if (node.textContent?.trim() === text) {
943
- node.click();
944
- return true;
945
- }
2417
+ const options = Array.from(nodes).map((node, index) => ({
2418
+ index,
2419
+ text: node.textContent?.trim() ?? '',
2420
+ value: node.textContent?.trim() ?? '',
2421
+ }));
2422
+ const normalize = (value) => value.trim().toLowerCase();
2423
+ const targetRaw = text.trim();
2424
+ const targetLower = normalize(text);
2425
+ let matchedIndex = options.findIndex((opt) => opt.text === targetRaw || opt.value === targetRaw);
2426
+ if (matchedIndex < 0) {
2427
+ matchedIndex = options.findIndex((opt) => normalize(opt.text) === targetLower ||
2428
+ normalize(opt.value) === targetLower);
2429
+ }
2430
+ if (matchedIndex < 0) {
2431
+ return { found: true, success: false, options };
946
2432
  }
947
- return false;
2433
+ nodes[matchedIndex].click();
2434
+ return {
2435
+ found: true,
2436
+ success: true,
2437
+ options,
2438
+ matched: options[matchedIndex],
2439
+ };
948
2440
  }, { xpath: domElement.xpath, text: params.text });
949
- if (clicked) {
950
- const msg = `Selected menu item ${params.text}`;
2441
+ if (clicked?.found && clicked.success) {
2442
+ const matchedText = clicked.matched?.text ?? params.text;
2443
+ const msg = `Selected menu item ${matchedText}`;
951
2444
  return new ActionResult({
952
2445
  extracted_content: msg,
953
2446
  include_in_memory: true,
954
2447
  long_term_memory: msg,
955
2448
  });
956
2449
  }
2450
+ if (clicked?.found) {
2451
+ const details = formatAvailableOptions(clicked.options ?? []);
2452
+ throw new BrowserError(`Could not select option '${params.text}' for index ${params.index}.\nAvailable options:\n${details}`);
2453
+ }
957
2454
  }
958
2455
  catch (error) {
2456
+ if (error instanceof BrowserError) {
2457
+ throw error;
2458
+ }
959
2459
  continue;
960
2460
  }
961
2461
  }
962
2462
  throw new BrowserError(`Could not select option '${params.text}' for index ${params.index}`);
963
2463
  });
2464
+ this.registry.action('Select dropdown option or ARIA menu item by text', {
2465
+ param_model: SelectDropdownActionSchema,
2466
+ action_name: 'select_dropdown',
2467
+ })(async function select_dropdown(params, ctx) {
2468
+ return registry.execute_action('select_dropdown_option', params, ctx);
2469
+ });
964
2470
  }
965
2471
  registerSheetsActions() {
966
2472
  const gotoSheetsRange = this.gotoSheetsRange.bind(this);
@@ -1090,13 +2596,11 @@ ${content}`;
1090
2596
  if (outputModel) {
1091
2597
  const structuredSchema = StructuredOutputActionSchema(outputModel);
1092
2598
  this.registry.action('Complete task - with return text and success flag.', { param_model: structuredSchema })(async function done(params) {
1093
- const payload = { ...params.data };
1094
- for (const key of Object.keys(payload)) {
1095
- const value = payload[key];
1096
- if (value && typeof value === 'object' && 'value' in value) {
1097
- payload[key] = value.value;
1098
- }
1099
- }
2599
+ const payload = params.data &&
2600
+ typeof params.data === 'object' &&
2601
+ !Array.isArray(params.data)
2602
+ ? params.data
2603
+ : {};
1100
2604
  return new ActionResult({
1101
2605
  is_done: true,
1102
2606
  success: params.success,
@@ -1121,9 +2625,6 @@ ${content}`;
1121
2625
  if (displayFilesInDoneText) {
1122
2626
  let attachmentText = '';
1123
2627
  for (const fileName of params.files_to_display) {
1124
- if (fileName === 'todo.md') {
1125
- continue;
1126
- }
1127
2628
  const content = fsInstance.display_file(fileName);
1128
2629
  if (content) {
1129
2630
  attachmentText += `\n\n${fileName}:\n${content}`;
@@ -1137,9 +2638,6 @@ ${content}`;
1137
2638
  }
1138
2639
  else {
1139
2640
  for (const fileName of params.files_to_display) {
1140
- if (fileName === 'todo.md') {
1141
- continue;
1142
- }
1143
2641
  const content = fsInstance.display_file(fileName);
1144
2642
  if (content) {
1145
2643
  attachments.push(fileName);
@@ -1158,8 +2656,24 @@ ${content}`;
1158
2656
  });
1159
2657
  }
1160
2658
  use_structured_output_action(outputModel) {
2659
+ this.outputModel = outputModel;
1161
2660
  this.registerDoneAction(outputModel);
1162
2661
  }
2662
+ get_output_model() {
2663
+ return this.outputModel;
2664
+ }
2665
+ exclude_action(actionName) {
2666
+ this.registry.exclude_action(actionName);
2667
+ }
2668
+ set_coordinate_clicking(enabled) {
2669
+ const resolved = Boolean(enabled);
2670
+ if (resolved === this.coordinateClickingEnabled) {
2671
+ return;
2672
+ }
2673
+ this.coordinateClickingEnabled = resolved;
2674
+ this.registerClickActions();
2675
+ this.logger.debug(`Coordinate clicking ${resolved ? 'enabled' : 'disabled'}`);
2676
+ }
1163
2677
  action(description, options = {}) {
1164
2678
  return this.registry.action(description, options);
1165
2679
  }
@@ -1185,11 +2699,36 @@ ${content}`;
1185
2699
  if (result == null) {
1186
2700
  return new ActionResult();
1187
2701
  }
1188
- return new ActionResult({ extracted_content: JSON.stringify(result) });
2702
+ const resultType = result && typeof result === 'object'
2703
+ ? (result.constructor?.name ?? typeof result)
2704
+ : typeof result;
2705
+ throw new Error(`Invalid action result type: ${resultType} of ${String(result)}`);
1189
2706
  }
1190
2707
  catch (error) {
2708
+ if (error instanceof BrowserError) {
2709
+ if (error.long_term_memory != null) {
2710
+ if (error.short_term_memory != null) {
2711
+ return new ActionResult({
2712
+ extracted_content: error.short_term_memory,
2713
+ error: error.long_term_memory,
2714
+ include_extracted_content_only_once: true,
2715
+ });
2716
+ }
2717
+ return new ActionResult({
2718
+ error: error.long_term_memory,
2719
+ });
2720
+ }
2721
+ throw error;
2722
+ }
2723
+ const message = String(error?.message ?? error ?? '');
2724
+ if (error instanceof Error &&
2725
+ message === `Error executing action ${actionName} due to timeout.`) {
2726
+ return new ActionResult({
2727
+ error: `${actionName} was not executed due to timeout.`,
2728
+ });
2729
+ }
1191
2730
  return new ActionResult({
1192
- error: String(error?.message ?? error ?? ''),
2731
+ error: message,
1193
2732
  });
1194
2733
  }
1195
2734
  }