camel-ai 0.2.73a4__py3-none-any.whl → 0.2.80a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173)
  1. camel/__init__.py +1 -1
  2. camel/agents/_utils.py +38 -0
  3. camel/agents/chat_agent.py +2217 -519
  4. camel/agents/mcp_agent.py +30 -27
  5. camel/configs/__init__.py +15 -0
  6. camel/configs/aihubmix_config.py +88 -0
  7. camel/configs/amd_config.py +70 -0
  8. camel/configs/cometapi_config.py +104 -0
  9. camel/configs/minimax_config.py +93 -0
  10. camel/configs/nebius_config.py +103 -0
  11. camel/data_collectors/alpaca_collector.py +15 -6
  12. camel/datasets/base_generator.py +39 -10
  13. camel/environments/single_step.py +28 -3
  14. camel/environments/tic_tac_toe.py +1 -1
  15. camel/interpreters/__init__.py +2 -0
  16. camel/interpreters/docker/Dockerfile +3 -12
  17. camel/interpreters/e2b_interpreter.py +34 -1
  18. camel/interpreters/microsandbox_interpreter.py +395 -0
  19. camel/loaders/__init__.py +11 -2
  20. camel/loaders/chunkr_reader.py +9 -0
  21. camel/memories/agent_memories.py +48 -4
  22. camel/memories/base.py +26 -0
  23. camel/memories/blocks/chat_history_block.py +122 -4
  24. camel/memories/context_creators/score_based.py +25 -384
  25. camel/memories/records.py +88 -8
  26. camel/messages/base.py +153 -34
  27. camel/models/__init__.py +10 -0
  28. camel/models/aihubmix_model.py +83 -0
  29. camel/models/aiml_model.py +1 -16
  30. camel/models/amd_model.py +101 -0
  31. camel/models/anthropic_model.py +6 -19
  32. camel/models/aws_bedrock_model.py +2 -33
  33. camel/models/azure_openai_model.py +114 -89
  34. camel/models/base_audio_model.py +3 -1
  35. camel/models/base_model.py +32 -14
  36. camel/models/cohere_model.py +1 -16
  37. camel/models/cometapi_model.py +83 -0
  38. camel/models/crynux_model.py +1 -16
  39. camel/models/deepseek_model.py +1 -16
  40. camel/models/fish_audio_model.py +6 -0
  41. camel/models/gemini_model.py +36 -18
  42. camel/models/groq_model.py +1 -17
  43. camel/models/internlm_model.py +1 -16
  44. camel/models/litellm_model.py +1 -16
  45. camel/models/lmstudio_model.py +1 -17
  46. camel/models/minimax_model.py +83 -0
  47. camel/models/mistral_model.py +1 -16
  48. camel/models/model_factory.py +27 -1
  49. camel/models/modelscope_model.py +1 -16
  50. camel/models/moonshot_model.py +105 -24
  51. camel/models/nebius_model.py +83 -0
  52. camel/models/nemotron_model.py +0 -5
  53. camel/models/netmind_model.py +1 -16
  54. camel/models/novita_model.py +1 -16
  55. camel/models/nvidia_model.py +1 -16
  56. camel/models/ollama_model.py +4 -19
  57. camel/models/openai_compatible_model.py +62 -41
  58. camel/models/openai_model.py +62 -57
  59. camel/models/openrouter_model.py +1 -17
  60. camel/models/ppio_model.py +1 -16
  61. camel/models/qianfan_model.py +1 -16
  62. camel/models/qwen_model.py +1 -16
  63. camel/models/reka_model.py +1 -16
  64. camel/models/samba_model.py +34 -47
  65. camel/models/sglang_model.py +64 -31
  66. camel/models/siliconflow_model.py +1 -16
  67. camel/models/stub_model.py +0 -4
  68. camel/models/togetherai_model.py +1 -16
  69. camel/models/vllm_model.py +1 -16
  70. camel/models/volcano_model.py +0 -17
  71. camel/models/watsonx_model.py +1 -16
  72. camel/models/yi_model.py +1 -16
  73. camel/models/zhipuai_model.py +60 -16
  74. camel/parsers/__init__.py +18 -0
  75. camel/parsers/mcp_tool_call_parser.py +176 -0
  76. camel/retrievers/auto_retriever.py +1 -0
  77. camel/runtimes/daytona_runtime.py +11 -12
  78. camel/societies/__init__.py +2 -0
  79. camel/societies/workforce/__init__.py +2 -0
  80. camel/societies/workforce/events.py +122 -0
  81. camel/societies/workforce/prompts.py +146 -66
  82. camel/societies/workforce/role_playing_worker.py +15 -11
  83. camel/societies/workforce/single_agent_worker.py +302 -65
  84. camel/societies/workforce/structured_output_handler.py +30 -18
  85. camel/societies/workforce/task_channel.py +163 -27
  86. camel/societies/workforce/utils.py +107 -13
  87. camel/societies/workforce/workflow_memory_manager.py +772 -0
  88. camel/societies/workforce/workforce.py +1949 -579
  89. camel/societies/workforce/workforce_callback.py +74 -0
  90. camel/societies/workforce/workforce_logger.py +168 -145
  91. camel/societies/workforce/workforce_metrics.py +33 -0
  92. camel/storages/key_value_storages/json.py +15 -2
  93. camel/storages/key_value_storages/mem0_cloud.py +48 -47
  94. camel/storages/object_storages/google_cloud.py +1 -1
  95. camel/storages/vectordb_storages/oceanbase.py +13 -13
  96. camel/storages/vectordb_storages/qdrant.py +3 -3
  97. camel/storages/vectordb_storages/tidb.py +8 -6
  98. camel/tasks/task.py +4 -3
  99. camel/toolkits/__init__.py +20 -7
  100. camel/toolkits/aci_toolkit.py +45 -0
  101. camel/toolkits/base.py +6 -4
  102. camel/toolkits/code_execution.py +28 -1
  103. camel/toolkits/context_summarizer_toolkit.py +684 -0
  104. camel/toolkits/dappier_toolkit.py +5 -1
  105. camel/toolkits/dingtalk.py +1135 -0
  106. camel/toolkits/edgeone_pages_mcp_toolkit.py +11 -31
  107. camel/toolkits/excel_toolkit.py +1 -1
  108. camel/toolkits/{file_write_toolkit.py → file_toolkit.py} +430 -36
  109. camel/toolkits/function_tool.py +13 -3
  110. camel/toolkits/github_toolkit.py +104 -17
  111. camel/toolkits/gmail_toolkit.py +1839 -0
  112. camel/toolkits/google_calendar_toolkit.py +38 -4
  113. camel/toolkits/google_drive_mcp_toolkit.py +12 -31
  114. camel/toolkits/hybrid_browser_toolkit/config_loader.py +15 -0
  115. camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit.py +77 -8
  116. camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit_ts.py +884 -88
  117. camel/toolkits/hybrid_browser_toolkit/installer.py +203 -0
  118. camel/toolkits/hybrid_browser_toolkit/ts/package-lock.json +5 -612
  119. camel/toolkits/hybrid_browser_toolkit/ts/package.json +0 -1
  120. camel/toolkits/hybrid_browser_toolkit/ts/src/browser-session.ts +959 -89
  121. camel/toolkits/hybrid_browser_toolkit/ts/src/config-loader.ts +9 -2
  122. camel/toolkits/hybrid_browser_toolkit/ts/src/hybrid-browser-toolkit.ts +281 -213
  123. camel/toolkits/hybrid_browser_toolkit/ts/src/parent-child-filter.ts +226 -0
  124. camel/toolkits/hybrid_browser_toolkit/ts/src/snapshot-parser.ts +219 -0
  125. camel/toolkits/hybrid_browser_toolkit/ts/src/som-screenshot-injected.ts +543 -0
  126. camel/toolkits/hybrid_browser_toolkit/ts/src/types.ts +23 -3
  127. camel/toolkits/hybrid_browser_toolkit/ts/websocket-server.js +72 -7
  128. camel/toolkits/hybrid_browser_toolkit/ws_wrapper.py +582 -132
  129. camel/toolkits/hybrid_browser_toolkit_py/actions.py +158 -0
  130. camel/toolkits/hybrid_browser_toolkit_py/browser_session.py +55 -8
  131. camel/toolkits/hybrid_browser_toolkit_py/config_loader.py +43 -0
  132. camel/toolkits/hybrid_browser_toolkit_py/hybrid_browser_toolkit.py +321 -8
  133. camel/toolkits/hybrid_browser_toolkit_py/snapshot.py +10 -4
  134. camel/toolkits/hybrid_browser_toolkit_py/unified_analyzer.js +45 -4
  135. camel/toolkits/{openai_image_toolkit.py → image_generation_toolkit.py} +151 -53
  136. camel/toolkits/klavis_toolkit.py +5 -1
  137. camel/toolkits/markitdown_toolkit.py +27 -1
  138. camel/toolkits/math_toolkit.py +64 -10
  139. camel/toolkits/mcp_toolkit.py +366 -71
  140. camel/toolkits/memory_toolkit.py +5 -1
  141. camel/toolkits/message_integration.py +18 -13
  142. camel/toolkits/minimax_mcp_toolkit.py +195 -0
  143. camel/toolkits/note_taking_toolkit.py +19 -10
  144. camel/toolkits/notion_mcp_toolkit.py +16 -26
  145. camel/toolkits/openbb_toolkit.py +5 -1
  146. camel/toolkits/origene_mcp_toolkit.py +8 -49
  147. camel/toolkits/playwright_mcp_toolkit.py +12 -31
  148. camel/toolkits/resend_toolkit.py +168 -0
  149. camel/toolkits/search_toolkit.py +264 -91
  150. camel/toolkits/slack_toolkit.py +64 -10
  151. camel/toolkits/terminal_toolkit/__init__.py +18 -0
  152. camel/toolkits/terminal_toolkit/terminal_toolkit.py +957 -0
  153. camel/toolkits/terminal_toolkit/utils.py +532 -0
  154. camel/toolkits/vertex_ai_veo_toolkit.py +590 -0
  155. camel/toolkits/video_analysis_toolkit.py +17 -11
  156. camel/toolkits/wechat_official_toolkit.py +483 -0
  157. camel/toolkits/zapier_toolkit.py +5 -1
  158. camel/types/__init__.py +2 -2
  159. camel/types/enums.py +274 -7
  160. camel/types/openai_types.py +2 -2
  161. camel/types/unified_model_type.py +15 -0
  162. camel/utils/commons.py +36 -5
  163. camel/utils/constants.py +3 -0
  164. camel/utils/context_utils.py +1003 -0
  165. camel/utils/mcp.py +138 -4
  166. camel/utils/token_counting.py +43 -20
  167. {camel_ai-0.2.73a4.dist-info → camel_ai-0.2.80a2.dist-info}/METADATA +223 -83
  168. {camel_ai-0.2.73a4.dist-info → camel_ai-0.2.80a2.dist-info}/RECORD +170 -141
  169. camel/loaders/pandas_reader.py +0 -368
  170. camel/toolkits/openai_agent_toolkit.py +0 -135
  171. camel/toolkits/terminal_toolkit.py +0 -1550
  172. {camel_ai-0.2.73a4.dist-info → camel_ai-0.2.80a2.dist-info}/WHEEL +0 -0
  173. {camel_ai-0.2.73a4.dist-info → camel_ai-0.2.80a2.dist-info}/licenses/LICENSE +0 -0
@@ -13,22 +13,39 @@
13
13
  # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
14
14
  # =========
15
15
 
16
+ import contextlib
16
17
  import time
17
- from typing import Any, Callable, ClassVar, Dict, List, Optional, cast
18
+ from typing import (
19
+ Any,
20
+ Callable,
21
+ ClassVar,
22
+ Dict,
23
+ List,
24
+ Optional,
25
+ TypedDict,
26
+ cast,
27
+ )
18
28
 
19
29
  from camel.logger import get_logger
20
30
  from camel.messages import BaseMessage
21
- from camel.models import BaseModelBackend
22
31
  from camel.toolkits.base import BaseToolkit, RegisteredAgentToolkit
23
32
  from camel.toolkits.function_tool import FunctionTool
24
33
  from camel.utils.commons import dependencies_required
25
34
 
26
35
  from .config_loader import ConfigLoader
27
- from .ws_wrapper import WebSocketBrowserWrapper
36
+ from .ws_wrapper import WebSocketBrowserWrapper, high_level_action
28
37
 
29
38
  logger = get_logger(__name__)
30
39
 
31
40
 
41
+ class SheetCell(TypedDict):
42
+ """Type definition for a sheet cell input."""
43
+
44
+ row: int
45
+ col: int
46
+ text: str
47
+
48
+
32
49
  class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
33
50
  r"""A hybrid browser toolkit that combines non-visual, DOM-based browser
34
51
  automation with visual, screenshot-based capabilities.
@@ -37,7 +54,6 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
37
54
  _snapshotForAI functionality for enhanced AI integration.
38
55
  """
39
56
 
40
- # Default tool list - core browser functionality
41
57
  DEFAULT_TOOLS: ClassVar[List[str]] = [
42
58
  "browser_open",
43
59
  "browser_close",
@@ -49,7 +65,6 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
49
65
  "browser_switch_tab",
50
66
  ]
51
67
 
52
- # All available tools
53
68
  ALL_TOOLS: ClassVar[List[str]] = [
54
69
  "browser_open",
55
70
  "browser_close",
@@ -58,17 +73,22 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
58
73
  "browser_forward",
59
74
  "browser_get_page_snapshot",
60
75
  "browser_get_som_screenshot",
61
- "browser_get_page_links",
62
76
  "browser_click",
63
77
  "browser_type",
64
78
  "browser_select",
65
79
  "browser_scroll",
66
80
  "browser_enter",
81
+ "browser_mouse_control",
82
+ "browser_mouse_drag",
83
+ "browser_press_key",
67
84
  "browser_wait_user",
68
- "browser_solve_task",
69
85
  "browser_switch_tab",
70
86
  "browser_close_tab",
71
87
  "browser_get_tab_info",
88
+ "browser_console_view",
89
+ "browser_console_exec",
90
+ "browser_sheet_input",
91
+ "browser_sheet_read",
72
92
  ]
73
93
 
74
94
  def __init__(
@@ -77,12 +97,12 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
77
97
  headless: bool = True,
78
98
  user_data_dir: Optional[str] = None,
79
99
  stealth: bool = False,
80
- web_agent_model: Optional[BaseModelBackend] = None,
81
- cache_dir: str = "tmp/",
100
+ cache_dir: Optional[str] = None,
82
101
  enabled_tools: Optional[List[str]] = None,
83
102
  browser_log_to_file: bool = False,
103
+ log_dir: Optional[str] = None,
84
104
  session_id: Optional[str] = None,
85
- default_start_url: str = "https://google.com/",
105
+ default_start_url: Optional[str] = None,
86
106
  default_timeout: Optional[int] = None,
87
107
  short_timeout: Optional[int] = None,
88
108
  navigation_timeout: Optional[int] = None,
@@ -93,6 +113,8 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
93
113
  viewport_limit: bool = False,
94
114
  connect_over_cdp: bool = False,
95
115
  cdp_url: Optional[str] = None,
116
+ cdp_keep_current_page: bool = False,
117
+ full_visual_mode: bool = False,
96
118
  ) -> None:
97
119
  r"""Initialize the HybridBrowserToolkit.
98
120
 
@@ -103,13 +125,13 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
103
125
  persistence. Defaults to None.
104
126
  stealth (bool): Whether to enable stealth mode. Defaults to
105
127
  False.
106
- web_agent_model (Optional[BaseModelBackend]): Model for web
107
- agent operations. Defaults to None.
108
128
  cache_dir (str): Directory for caching. Defaults to "tmp/".
109
129
  enabled_tools (Optional[List[str]]): List of enabled tools.
110
130
  Defaults to None.
111
131
  browser_log_to_file (bool): Whether to log browser actions to
112
132
  file. Defaults to False.
133
+ log_dir (Optional[str]): Custom directory path for log files.
134
+ If None, defaults to "browser_log". Defaults to None.
113
135
  session_id (Optional[str]): Session identifier. Defaults to None.
114
136
  default_start_url (str): Default URL to start with. Defaults
115
137
  to "https://google.com/".
@@ -138,11 +160,15 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
138
160
  cdp_url (Optional[str]): WebSocket endpoint URL for CDP
139
161
  connection (e.g., 'ws://localhost:9222/devtools/browser/...').
140
162
  Required when connect_over_cdp is True. Defaults to None.
163
+ cdp_keep_current_page (bool): When True and using CDP mode,
164
+ won't create new pages but use the existing one. Defaults to False.
165
+ full_visual_mode (bool): When True, browser actions like click,
166
+ browser_open, visit_page, etc. will not return snapshots.
167
+ Defaults to False.
141
168
  """
142
169
  super().__init__()
143
170
  RegisteredAgentToolkit.__init__(self)
144
171
 
145
- # Initialize configuration loader
146
172
  self.config_loader = ConfigLoader.from_kwargs(
147
173
  headless=headless,
148
174
  user_data_dir=user_data_dir,
@@ -158,27 +184,39 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
158
184
  viewport_limit=viewport_limit,
159
185
  cache_dir=cache_dir,
160
186
  browser_log_to_file=browser_log_to_file,
187
+ log_dir=log_dir,
161
188
  session_id=session_id,
162
189
  enabled_tools=enabled_tools,
163
190
  connect_over_cdp=connect_over_cdp,
164
191
  cdp_url=cdp_url,
192
+ cdp_keep_current_page=cdp_keep_current_page,
193
+ full_visual_mode=full_visual_mode,
165
194
  )
166
195
 
167
- # Legacy attribute access for backward compatibility
168
196
  browser_config = self.config_loader.get_browser_config()
169
197
  toolkit_config = self.config_loader.get_toolkit_config()
170
198
 
199
+ if (
200
+ browser_config.cdp_keep_current_page
201
+ and default_start_url is not None
202
+ ):
203
+ raise ValueError(
204
+ "Cannot use default_start_url with "
205
+ "cdp_keep_current_page=True. When cdp_keep_current_page "
206
+ "is True, the browser will keep the current page and not "
207
+ "navigate to any URL."
208
+ )
209
+
171
210
  self._headless = browser_config.headless
172
211
  self._user_data_dir = browser_config.user_data_dir
173
212
  self._stealth = browser_config.stealth
174
- self._web_agent_model = web_agent_model
175
213
  self._cache_dir = toolkit_config.cache_dir
176
214
  self._browser_log_to_file = toolkit_config.browser_log_to_file
177
215
  self._default_start_url = browser_config.default_start_url
178
216
  self._session_id = toolkit_config.session_id or "default"
179
217
  self._viewport_limit = browser_config.viewport_limit
218
+ self._full_visual_mode = browser_config.full_visual_mode
180
219
 
181
- # Store timeout configuration for backward compatibility
182
220
  self._default_timeout = browser_config.default_timeout
183
221
  self._short_timeout = browser_config.short_timeout
184
222
  self._navigation_timeout = browser_config.navigation_timeout
@@ -189,11 +227,9 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
189
227
  browser_config.dom_content_loaded_timeout
190
228
  )
191
229
 
192
- # Configure enabled tools
193
230
  if enabled_tools is None:
194
231
  self.enabled_tools = self.DEFAULT_TOOLS.copy()
195
232
  else:
196
- # Validate enabled tools
197
233
  invalid_tools = [
198
234
  tool for tool in enabled_tools if tool not in self.ALL_TOOLS
199
235
  ]
@@ -206,7 +242,6 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
206
242
 
207
243
  logger.info(f"Enabled tools: {self.enabled_tools}")
208
244
 
209
- # Initialize WebSocket wrapper
210
245
  self._ws_wrapper: Optional[WebSocketBrowserWrapper] = None
211
246
  self._ws_config = self.config_loader.to_ws_config()
212
247
 
@@ -233,13 +268,29 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
233
268
 
234
269
  import asyncio
235
270
 
271
+ is_cdp = (
272
+ self._ws_config.get('connectOverCdp', False)
273
+ if hasattr(self, '_ws_config')
274
+ else False
275
+ )
276
+
236
277
  try:
237
278
  loop = asyncio.get_event_loop()
238
279
  if not loop.is_closed() and not loop.is_running():
239
280
  try:
240
- loop.run_until_complete(
241
- asyncio.wait_for(self.browser_close(), timeout=2.0)
242
- )
281
+ if is_cdp:
282
+ # CDP: disconnect only
283
+ loop.run_until_complete(
284
+ asyncio.wait_for(
285
+ self.disconnect_websocket(), timeout=2.0
286
+ )
287
+ )
288
+ else:
289
+ loop.run_until_complete(
290
+ asyncio.wait_for(
291
+ self.browser_close(), timeout=2.0
292
+ )
293
+ )
243
294
  except asyncio.TimeoutError:
244
295
  pass
245
296
  except (RuntimeError, ImportError):
@@ -247,23 +298,11 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
247
298
  except Exception:
248
299
  pass
249
300
 
250
- @property
251
- def web_agent_model(self) -> Optional[BaseModelBackend]:
252
- """Get the web agent model."""
253
- return self._web_agent_model
254
-
255
- @web_agent_model.setter
256
- def web_agent_model(self, value: Optional[BaseModelBackend]) -> None:
257
- """Set the web agent model."""
258
- self._web_agent_model = value
259
-
260
301
  @property
261
302
  def cache_dir(self) -> str:
262
303
  """Get the cache directory."""
263
304
  return self._cache_dir
264
305
 
265
- # Public API Methods
266
-
267
306
  async def browser_open(self) -> Dict[str, Any]:
268
307
  r"""Starts a new browser session. This must be the first browser
269
308
  action.
@@ -284,7 +323,6 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
284
323
  ws_wrapper = await self._get_ws_wrapper()
285
324
  result = await ws_wrapper.open_browser(self._default_start_url)
286
325
 
287
- # Add tab information
288
326
  tab_info = await ws_wrapper.get_tab_info()
289
327
  result.update(
290
328
  {
@@ -329,6 +367,31 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
329
367
  logger.error(f"Failed to close browser: {e}")
330
368
  return f"Error closing browser: {e}"
331
369
 
370
+ async def disconnect_websocket(self) -> str:
371
+ r"""Disconnects the WebSocket connection without closing the browser.
372
+
373
+ This is useful when using CDP mode where the browser should
374
+ remain open.
375
+
376
+ Returns:
377
+ str: A confirmation message.
378
+ """
379
+ try:
380
+ if self._ws_wrapper:
381
+ is_cdp = self._ws_config.get('connectOverCdp', False)
382
+
383
+ if is_cdp:
384
+ # CDP: disconnect only
385
+ await self._ws_wrapper.disconnect_only()
386
+ else:
387
+ await self._ws_wrapper.stop()
388
+
389
+ self._ws_wrapper = None
390
+ return "WebSocket disconnected."
391
+ except Exception as e:
392
+ logger.error(f"Failed to disconnect WebSocket: {e}")
393
+ return f"Error disconnecting WebSocket: {e}"
394
+
332
395
  async def browser_visit_page(self, url: str) -> Dict[str, Any]:
333
396
  r"""Opens a URL in a new browser tab and switches to it.
334
397
 
@@ -348,7 +411,6 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
348
411
  ws_wrapper = await self._get_ws_wrapper()
349
412
  result = await ws_wrapper.visit_page(url)
350
413
 
351
- # Add tab information
352
414
  tab_info = await ws_wrapper.get_tab_info()
353
415
  result.update(
354
416
  {
@@ -394,7 +456,6 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
394
456
  ws_wrapper = await self._get_ws_wrapper()
395
457
  result = await ws_wrapper.back()
396
458
 
397
- # Add tab information
398
459
  tab_info = await ws_wrapper.get_tab_info()
399
460
  result.update(
400
461
  {
@@ -440,7 +501,6 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
440
501
  ws_wrapper = await self._get_ws_wrapper()
441
502
  result = await ws_wrapper.forward()
442
503
 
443
- # Add tab information
444
504
  tab_info = await ws_wrapper.get_tab_info()
445
505
  result.update(
446
506
  {
@@ -532,19 +592,14 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
532
592
  ws_wrapper = await self._get_ws_wrapper()
533
593
  result = await ws_wrapper.get_som_screenshot()
534
594
 
535
- # Initialize result text
536
595
  result_text = result.text
537
596
  file_path = None
538
597
 
539
- # Save screenshot to cache directory if images are available
540
598
  if result.images:
541
- # Ensure cache directory exists (use absolute path)
542
599
  cache_dir = os.path.abspath(self._cache_dir)
543
600
  os.makedirs(cache_dir, exist_ok=True)
544
601
 
545
- # Get current page URL for filename
546
602
  try:
547
- # Try to get the current page URL from the wrapper
548
603
  page_info = await ws_wrapper.get_tab_info()
549
604
  current_tab = next(
550
605
  (tab for tab in page_info if tab.get('is_current')),
@@ -554,7 +609,6 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
554
609
  except Exception:
555
610
  url = 'unknown'
556
611
 
557
- # Generate filename
558
612
  parsed_url = urllib.parse.urlparse(url)
559
613
  url_name = sanitize_filename(
560
614
  str(parsed_url.path) or 'homepage', max_length=241
@@ -564,24 +618,19 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
564
618
  cache_dir, f"{url_name}_{timestamp}_som.png"
565
619
  )
566
620
 
567
- # Extract base64 data and save to file
568
621
  for _, image_data in enumerate(result.images):
569
622
  if image_data.startswith('data:image/png;base64,'):
570
- # Remove data URL prefix
571
623
  base64_data = image_data.split(',', 1)[1]
572
624
 
573
- # Decode and save
574
625
  image_bytes = base64.b64decode(base64_data)
575
626
  with open(file_path, 'wb') as f:
576
627
  f.write(image_bytes)
577
628
 
578
629
  logger.info(f"Screenshot saved to: {file_path}")
579
630
 
580
- # Update result text to include file path
581
631
  result_text += f" (saved to: {file_path})"
582
632
  break
583
633
 
584
- # Analyze image if requested and agent is registered
585
634
  if read_image and file_path:
586
635
  if self.agent is None:
587
636
  logger.error(
@@ -596,7 +645,6 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
596
645
  )
597
646
  else:
598
647
  try:
599
- # Load the image and create a message
600
648
  from PIL import Image
601
649
 
602
650
  img = Image.open(file_path)
@@ -607,7 +655,6 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
607
655
  image_list=[img],
608
656
  )
609
657
 
610
- # Get agent's analysis
611
658
  response = await self.agent.astep(message)
612
659
  agent_response = response.msgs[0].content
613
660
  result_text += f". Agent analysis: {agent_response}"
@@ -641,24 +688,30 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
641
688
  ws_wrapper = await self._get_ws_wrapper()
642
689
  result = await ws_wrapper.click(ref)
643
690
 
644
- # Add tab information
645
691
  tab_info = await ws_wrapper.get_tab_info()
646
- result.update(
647
- {
648
- "tabs": tab_info,
649
- "current_tab": next(
650
- (
651
- i
652
- for i, tab in enumerate(tab_info)
653
- if tab.get("is_current")
654
- ),
655
- 0,
692
+
693
+ response = {
694
+ "result": result.get("result", ""),
695
+ "snapshot": result.get("snapshot", ""),
696
+ "tabs": tab_info,
697
+ "current_tab": next(
698
+ (
699
+ i
700
+ for i, tab in enumerate(tab_info)
701
+ if tab.get("is_current")
656
702
  ),
657
- "total_tabs": len(tab_info),
658
- }
659
- )
703
+ 0,
704
+ ),
705
+ "total_tabs": len(tab_info),
706
+ }
660
707
 
661
- return result
708
+ if "newTabId" in result:
709
+ response["newTabId"] = result["newTabId"]
710
+
711
+ if "timing" in result:
712
+ response["timing"] = result["timing"]
713
+
714
+ return response
662
715
  except Exception as e:
663
716
  logger.error(f"Failed to click element: {e}")
664
717
  return {
@@ -669,12 +722,29 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
669
722
  "total_tabs": 0,
670
723
  }
671
724
 
672
- async def browser_type(self, *, ref: str, text: str) -> Dict[str, Any]:
673
- r"""Types text into an input element on the page.
725
+ async def browser_type(
726
+ self,
727
+ *,
728
+ ref: Optional[str] = None,
729
+ text: Optional[str] = None,
730
+ inputs: Optional[List[Dict[str, str]]] = None,
731
+ ) -> Dict[str, Any]:
732
+ r"""Types text into one or more input elements on the page.
733
+
734
+ This method supports two modes:
735
+ 1. Single input mode (backward compatible): Provide 'ref' and 'text'
736
+ 2. Multiple inputs mode: Provide 'inputs' as a list of dictionaries
737
+ with 'ref' and 'text' keys
674
738
 
675
739
  Args:
676
- ref (str): The `ref` ID of the input element, from a snapshot.
677
- text (str): The text to type into the element.
740
+ ref (Optional[str]): The `ref` ID of the input element, from a
741
+ snapshot. Required when using single input mode.
742
+ text (Optional[str]): The text to type into the element. Required
743
+ when using single input mode.
744
+ inputs (Optional[List[Dict[str, str]]]): List of dictionaries,
745
+ each containing 'ref' and 'text' keys for typing into multiple
746
+ elements. Example: [{'ref': '1', 'text': 'username'},
747
+ {'ref': '2', 'text': 'password'}]
678
748
 
679
749
  Returns:
680
750
  Dict[str, Any]: A dictionary with the result of the action:
@@ -684,12 +754,22 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
684
754
  - "tabs" (List[Dict]): Information about all open tabs.
685
755
  - "current_tab" (int): Index of the active tab.
686
756
  - "total_tabs" (int): Total number of open tabs.
757
+ - "details" (Dict[str, Any]): When using multiple inputs,
758
+ contains success/error status for each ref.
687
759
  """
688
760
  try:
689
761
  ws_wrapper = await self._get_ws_wrapper()
690
- result = await ws_wrapper.type(ref, text)
691
762
 
692
- # Add tab information
763
+ if ref is not None and text is not None:
764
+ result = await ws_wrapper.type(ref, text)
765
+ elif inputs is not None:
766
+ result = await ws_wrapper.type_multiple(inputs)
767
+ else:
768
+ raise ValueError(
769
+ "Either provide 'ref' and 'text' for single input, "
770
+ "or 'inputs' for multiple inputs"
771
+ )
772
+
693
773
  tab_info = await ws_wrapper.get_tab_info()
694
774
  result.update(
695
775
  {
@@ -738,7 +818,6 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
738
818
  ws_wrapper = await self._get_ws_wrapper()
739
819
  result = await ws_wrapper.select(ref, value)
740
820
 
741
- # Add tab information
742
821
  tab_info = await ws_wrapper.get_tab_info()
743
822
  result.update(
744
823
  {
@@ -787,7 +866,6 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
787
866
  ws_wrapper = await self._get_ws_wrapper()
788
867
  result = await ws_wrapper.scroll(direction, amount)
789
868
 
790
- # Add tab information
791
869
  tab_info = await ws_wrapper.get_tab_info()
792
870
  result.update(
793
871
  {
@@ -835,7 +913,6 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
835
913
  ws_wrapper = await self._get_ws_wrapper()
836
914
  result = await ws_wrapper.enter()
837
915
 
838
- # Add tab information
839
916
  tab_info = await ws_wrapper.get_tab_info()
840
917
  result.update(
841
918
  {
@@ -863,6 +940,153 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
863
940
  "total_tabs": 0,
864
941
  }
865
942
 
943
+ async def browser_mouse_control(
944
+ self, *, control: str, x: float, y: float
945
+ ) -> Dict[str, Any]:
946
+ r"""Control the mouse to interact with browser with x, y coordinates
947
+
948
+ Args:
949
+ control ([str]): The action to perform: 'click', 'right_click'
950
+ or 'dblclick'.
951
+ x (float): x-coordinate for the control action.
952
+ y (float): y-coordinate for the control action.
953
+
954
+ Returns:
955
+ Dict[str, Any]: A dictionary with the result of the action:
956
+ - "result" (str): Confirmation of the action.
957
+ - "snapshot" (str): A snapshot of the page after mouse
958
+ control action.
959
+ - "tabs" (List[Dict]): Information about all open tabs.
960
+ - "current_tab" (int): Index of the active tab.
961
+ - "total_tabs" (int): Total number of open tabs.
962
+ """
963
+ try:
964
+ ws_wrapper = await self._get_ws_wrapper()
965
+ result = await ws_wrapper.mouse_control(control, x, y)
966
+
967
+ tab_info = await ws_wrapper.get_tab_info()
968
+ result.update(
969
+ {
970
+ "tabs": tab_info,
971
+ "current_tab": next(
972
+ (
973
+ i
974
+ for i, tab in enumerate(tab_info)
975
+ if tab.get("is_current")
976
+ ),
977
+ 0,
978
+ ),
979
+ "total_tabs": len(tab_info),
980
+ }
981
+ )
982
+
983
+ return result
984
+ except Exception as e:
985
+ logger.error(f"Failed to control mouse: {e}")
986
+ return {
987
+ "result": f"Error with mouse control: {e}",
988
+ "snapshot": "",
989
+ "tabs": [],
990
+ "current_tab": 0,
991
+ "total_tabs": 0,
992
+ }
993
+
994
+ async def browser_mouse_drag(
995
+ self, *, from_ref: str, to_ref: str
996
+ ) -> Dict[str, Any]:
997
+ r"""Control the mouse to drag and drop in the browser using ref IDs.
998
+
999
+ Args:
1000
+ from_ref (str): The `ref` ID of the source element to drag from.
1001
+ to_ref (str): The `ref` ID of the target element to drag to.
1002
+
1003
+ Returns:
1004
+ Dict[str, Any]: A dictionary with the result of the action:
1005
+ - "result" (str): Confirmation of the action.
1006
+ - "snapshot" (str): A new page snapshot.
1007
+ - "tabs" (List[Dict]): Information about all open tabs.
1008
+ - "current_tab" (int): Index of the active tab.
1009
+ - "total_tabs" (int): Total number of open tabs.
1010
+ """
1011
+ try:
1012
+ ws_wrapper = await self._get_ws_wrapper()
1013
+ result = await ws_wrapper.mouse_drag(from_ref, to_ref)
1014
+
1015
+ tab_info = await ws_wrapper.get_tab_info()
1016
+ result.update(
1017
+ {
1018
+ "tabs": tab_info,
1019
+ "current_tab": next(
1020
+ (
1021
+ i
1022
+ for i, tab in enumerate(tab_info)
1023
+ if tab.get("is_current")
1024
+ ),
1025
+ 0,
1026
+ ),
1027
+ "total_tabs": len(tab_info),
1028
+ }
1029
+ )
1030
+
1031
+ return result
1032
+ except Exception as e:
1033
+ logger.error(f"Error with mouse drag and drop: {e}")
1034
+ return {
1035
+ "result": f"Error with mouse drag and drop: {e}",
1036
+ "snapshot": "",
1037
+ "tabs": [],
1038
+ "current_tab": 0,
1039
+ "total_tabs": 0,
1040
+ }
1041
+
1042
+ async def browser_press_key(self, *, keys: List[str]) -> Dict[str, Any]:
1043
+ r"""Press key and key combinations.
1044
+ Supports single key press or combination of keys by concatenating
1045
+ them with '+' separator.
1046
+
1047
+ Args:
1048
+ keys (List[str]): key or list of keys.
1049
+
1050
+ Returns:
1051
+ Dict[str, Any]: A dictionary with the result of the action:
1052
+ - "result" (str): Confirmation of the action.
1053
+ - "snapshot" (str): A snapshot of the page after
1054
+ press key action.
1055
+ - "tabs" (List[Dict]): Information about all open tabs.
1056
+ - "current_tab" (int): Index of the active tab.
1057
+ - "total_tabs" (int): Total number of open tabs.
1058
+ """
1059
+ try:
1060
+ ws_wrapper = await self._get_ws_wrapper()
1061
+ result = await ws_wrapper.press_key(keys)
1062
+
1063
+ tab_info = await ws_wrapper.get_tab_info()
1064
+ result.update(
1065
+ {
1066
+ "tabs": tab_info,
1067
+ "current_tab": next(
1068
+ (
1069
+ i
1070
+ for i, tab in enumerate(tab_info)
1071
+ if tab.get("is_current")
1072
+ ),
1073
+ 0,
1074
+ ),
1075
+ "total_tabs": len(tab_info),
1076
+ }
1077
+ )
1078
+
1079
+ return result
1080
+ except Exception as e:
1081
+ logger.error(f"Failed to press key: {e}")
1082
+ return {
1083
+ "result": f"Error with press key: {e}",
1084
+ "snapshot": "",
1085
+ "tabs": [],
1086
+ "current_tab": 0,
1087
+ "total_tabs": 0,
1088
+ }
1089
+
866
1090
  async def browser_switch_tab(self, *, tab_id: str) -> Dict[str, Any]:
867
1091
  r"""Switches to a different browser tab using its ID.
868
1092
 
@@ -884,7 +1108,6 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
884
1108
  ws_wrapper = await self._get_ws_wrapper()
885
1109
  result = await ws_wrapper.switch_tab(tab_id)
886
1110
 
887
- # Add tab information
888
1111
  tab_info = await ws_wrapper.get_tab_info()
889
1112
  result.update(
890
1113
  {
@@ -934,7 +1157,6 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
934
1157
  ws_wrapper = await self._get_ws_wrapper()
935
1158
  result = await ws_wrapper.close_tab(tab_id)
936
1159
 
937
- # Add tab information
938
1160
  tab_info = await ws_wrapper.get_tab_info()
939
1161
  result.update(
940
1162
  {
@@ -1002,6 +1224,582 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
1002
1224
  "total_tabs": 0,
1003
1225
  }
1004
1226
 
1227
async def browser_console_view(self) -> Dict[str, Any]:
    r"""Retrieve the console logs of the current page.

    Returns:
        Dict[str, Any]: A dictionary with a single entry:
            - "console_messages" (List[Dict]): Messages logged in the
              current page. An empty list is returned when the logs
              cannot be retrieved.
    """
    messages: Any = []
    try:
        wrapper = await self._get_ws_wrapper()
        messages = await wrapper.console_view()
    except Exception as e:
        # Best-effort tool: report the failure in the log and fall back
        # to an empty message list instead of raising.
        logger.error(f"Failed to get console view: {e}")
        messages = []
    return {"console_messages": messages}
1244
+
1245
async def browser_console_exec(self, code: str) -> Dict[str, Any]:
    r"""Run JavaScript in the console of the current page and return the
    outcome.

    Args:
        code (str): JavaScript code to execute in the browser console.

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): Outcome of the execution, or an error
              description if the execution failed.
            - "snapshot" (str): A snapshot of the active tab after the
              console execution (empty string on failure).
            - "tabs" (List[Dict]): Information about all open tabs.
            - "current_tab" (int): Index of the active tab.
            - "total_tabs" (int): Total number of open tabs.
    """
    try:
        wrapper = await self._get_ws_wrapper()
        outcome = await wrapper.console_exec(code)
        open_tabs = await wrapper.get_tab_info()

        # Index of the first tab flagged as current; 0 when none is flagged.
        active_index = 0
        for position, entry in enumerate(open_tabs):
            if entry.get("is_current"):
                active_index = position
                break

        tab_fields = {
            "tabs": open_tabs,
            "current_tab": active_index,
            "total_tabs": len(open_tabs),
        }
        outcome.update(tab_fields)
        return outcome
    except Exception as e:
        logger.error(f"Failed to execute javascript in console: {e}")
        return {
            "result": f"Error in code execution: {e}",
            "snapshot": "",
            "tabs": [],
            "current_tab": 0,
            "total_tabs": 0,
        }
1291
+
1292
@high_level_action
async def browser_sheet_input(
    self, *, cells: List[SheetCell]
) -> Dict[str, Any]:
    r"""Input text into multiple cells in a spreadsheet (e.g., Google
    Sheets).

    Args:
        cells (List[Dict[str, Any]]): List of cells to input, each
            containing:
            - "row" (int): Row index (0-based). Row 0 = first row,
              Row 1 = second row, etc.
            - "col" (int | str): Column index (0-based). Col 0 =
              Column A, Col 1 = Column B, etc. A column label
              ("A", "B", ..., "AA") or a digit string ("2") is also
              accepted and converted to the 0-based index.
            - "text" (str): Text to input into the cell

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): Confirmation of the action with details,
              or an error description (e.g. for an invalid column
              reference).
            - "content" (str): The updated spreadsheet content (auto-read
              after input); empty string if it could not be read.
            - "snapshot" (str): Always empty string (sheet tools don't
              return snapshots).
            - "tabs" (List[Dict]): Information about all open tabs.
            - "current_tab" (int): Index of the active tab.
            - "total_tabs" (int): Total number of open tabs.

    Example:
        >>> cells = [
        ...     {"row": 0, "col": 0, "text": "Name"},
        ...     {"row": 0, "col": 1, "text": "Age"},
        ...     {"row": 1, "col": 0, "text": "Alice"},
        ...     {"row": 1, "col": 1, "text": "30"},
        ... ]
    """

    def _column_index(raw: Any) -> int:
        # Turn a column reference into a 0-based index. Only pure A-Z
        # labels go through the base-26 conversion; anything else that is
        # not a plain number is rejected instead of silently producing a
        # meaningless (possibly negative) index.
        if not isinstance(raw, str):
            return int(raw)
        label = raw.strip().upper()
        if label.isdecimal():
            return int(label)
        if not label or not all('A' <= ch <= 'Z' for ch in label):
            raise ValueError(f"Invalid column reference: {raw!r}")
        # A->0, B->1, ..., Z->25, AA->26, AB->27, etc.
        index = 0
        for ch in label:
            index = index * 26 + (ord(ch) - ord('A') + 1)
        return index - 1

    try:
        import platform

        ws_wrapper = await self._get_ws_wrapper()
        system = platform.system()

        # Normalize cells: integer row/col and string text
        normalized_cells = []
        for cell in cells:
            normalized_cell = cell.copy()
            normalized_cell["col"] = _column_index(cell.get("col", 0))
            # Row is always used as-is (should be 0-based integer)
            normalized_cell["row"] = int(cell.get("row", 0))
            normalized_cell["text"] = str(cell.get("text", ""))
            normalized_cells.append(normalized_cell)

        # Perform batch input
        input_result = await self._sheet_input_batch_js(
            normalized_cells, ws_wrapper, system
        )

        # Read sheet content after input
        try:
            read_result = await self.browser_sheet_read()
            return {
                "result": input_result["result"],
                "content": read_result.get("content", ""),
                "snapshot": "",
                "tabs": input_result.get("tabs", []),
                "current_tab": input_result.get("current_tab", 0),
                "total_tabs": input_result.get("total_tabs", 0),
            }
        except Exception as read_error:
            logger.warning(f"Failed to auto-read sheet: {read_error}")
            input_result["snapshot"] = ""
            # Keep the documented return shape even without a read-back.
            input_result.setdefault("content", "")
            return input_result

    except Exception as e:
        logger.error(f"Failed to input to sheet: {e}")
        return {
            "result": f"Error inputting to sheet: {e}",
            "content": "",
            "snapshot": "",
            "tabs": [],
            "current_tab": 0,
            "total_tabs": 0,
        }
1386
+
1387
async def _sheet_input_batch_js(
    self,
    cells: List[SheetCell],
    ws_wrapper: Any,
    system: str,
) -> Dict[str, Any]:
    r"""Input to sheet using batch keyboard input with relative
    positioning.

    Builds all operations and sends them in ONE command to TypeScript,
    which executes them and only waits for stability once at the end.

    Args:
        cells (List[SheetCell]): Cells to fill; each provides "row" and
            "col" (0-based integers) and "text".
        ws_wrapper (Any): WebSocket wrapper used to send the batch
            command and to query tab information.
        system (str): Platform name; "Darwin" selects the Meta key for
            the jump-to-A1 shortcut, anything else selects Control.

    Returns:
        Dict[str, Any]: A dictionary with "result", "snapshot" (always an
            empty string), "tabs", "current_tab" and "total_tabs". On
            failure, or when a cell has a negative row/column, "result"
            holds an error description and the tab fields are empty.
    """
    # Navigation is relative to A1 and the cursor cannot move above or
    # left of it. A negative target would desynchronize the position
    # bookkeeping below and misplace every following cell, so reject it
    # before any key is sent.
    for cell in cells:
        row, col = cell.get("row", 0), cell.get("col", 0)
        if row < 0 or col < 0:
            problem = f"negative cell position (row={row}, col={col})"
            logger.error(f"Batch keyboard execution failed: {problem}")
            return {
                "result": f"Error in batch keyboard execution: {problem}",
                "snapshot": "",
                "tabs": [],
                "current_tab": 0,
                "total_tabs": 0,
            }

    operations: List[Dict[str, Any]] = []

    def add_moves(key: str, count: int) -> None:
        # Queue `count` presses of an arrow key, each followed by a
        # short pause.
        for _ in range(count):
            operations.append({"type": "press", "keys": [key]})
            operations.append({"type": "wait", "delay": 50})

    # Go to A1 to ensure we start from a known position
    home_modifier = "Meta" if system == "Darwin" else "Control"
    operations.append({"type": "press", "keys": [home_modifier, "Home"]})
    operations.append({"type": "wait", "delay": 310})

    # Start at (0, 0)
    current_row = 0
    current_col = 0

    for cell in cells:
        target_row = cell.get("row", 0)
        target_col = cell.get("col", 0)
        text = cell.get("text", "")

        # Calculate relative movement needed
        row_diff = target_row - current_row
        col_diff = target_col - current_col

        # Navigate vertically, then horizontally (no-ops when diff is 0)
        add_moves("ArrowDown" if row_diff > 0 else "ArrowUp", abs(row_diff))
        add_moves(
            "ArrowRight" if col_diff > 0 else "ArrowLeft", abs(col_diff)
        )

        # Wait after navigation if moved
        if row_diff != 0 or col_diff != 0:
            operations.append({"type": "wait", "delay": 100})

        # Clear and input
        operations.append({"type": "press", "keys": ["Delete"]})
        operations.append({"type": "wait", "delay": 120})

        if text:
            operations.append({"type": "type", "text": text, "delay": 0})
            operations.append({"type": "wait", "delay": 120})

        # Press Enter to confirm
        operations.append({"type": "press", "keys": ["Enter"]})
        operations.append({"type": "wait", "delay": 130})

        # Update current position (after Enter, cursor moves to next row)
        current_row = target_row + 1
        current_col = target_col

    try:
        await ws_wrapper._send_command(
            'batch_keyboard_input',
            {'operations': operations, 'skipStabilityWait': True},
        )
        tab_info = await ws_wrapper.get_tab_info()

        return {
            "result": f"Successfully input to {len(cells)} cells",
            "snapshot": "",
            "tabs": tab_info,
            "current_tab": next(
                (
                    i
                    for i, tab in enumerate(tab_info)
                    if tab.get("is_current")
                ),
                0,
            ),
            "total_tabs": len(tab_info),
        }

    except Exception as e:
        logger.error(f"Batch keyboard execution failed: {e}")
        return {
            "result": f"Error in batch keyboard execution: {e}",
            "snapshot": "",
            "tabs": [],
            "current_tab": 0,
            "total_tabs": 0,
        }
1494
+
1495
def _trim_sheet_content(self, content: str) -> str:
    r"""Drop empty rows/columns from raw sheet text and label the rest.

    The input is split into rows on newlines and into cells on tabs.
    Rows and columns whose cells are all blank are removed, and the
    remaining grid is labelled with:
    - Column headers: A, B, C, D... (first output line)
    - Row numbers: 0, 1, 2, 3... (first field of each data line)

    Labels refer to the positions in the original, untrimmed grid.

    Args:
        content (str): Raw sheet content with tabs and newlines.

    Returns:
        str: Trimmed content with row/column labels, or an empty string
            when the input contains no non-blank cell.
    """
    if not content or not content.strip():
        return ""

    # Parse into a rectangular table, right-padding short rows.
    table = [line.split('\t') for line in content.split('\n')]
    width = max(len(fields) for fields in table)
    for fields in table:
        fields.extend([''] * (width - len(fields)))

    # Original indices of rows/columns holding at least one non-blank cell.
    kept_rows = [
        r for r, fields in enumerate(table) if any(f.strip() for f in fields)
    ]
    kept_cols = [
        c for c in range(width) if any(fields[c].strip() for fields in table)
    ]
    if not (kept_rows and kept_cols):
        return ""

    def to_letters(zero_based):
        # Spreadsheet-style label: 0 -> A, 25 -> Z, 26 -> AA, 27 -> AB.
        remaining = zero_based + 1
        letters = []
        while remaining > 0:
            remaining, offset = divmod(remaining - 1, 26)
            letters.append(chr(ord('A') + offset))
        return ''.join(reversed(letters))

    # Header line: empty corner cell followed by the column labels.
    lines = ['\t'.join([''] + [to_letters(c) for c in kept_cols])]
    # Data lines: original 0-based row number followed by the kept cells.
    for r in kept_rows:
        lines.append('\t'.join([str(r)] + [table[r][c] for c in kept_cols]))

    return '\n'.join(lines)
1570
+
1571
@high_level_action
async def browser_sheet_read(self) -> Dict[str, Any]:
    r"""Read content from a spreadsheet.

    This tool reads spreadsheet content and returns it in a structured
    format with row/column labels. Empty rows and columns are
    automatically removed.

    The content is obtained by installing a temporary 'copy' event
    listener in the page, triggering select-all + copy through the
    keyboard twice, and keeping the captured text that has more
    non-empty cells. The listener is always removed afterwards.

    Output format:
    - First row: Column labels (A, B, C, ..., Z, AA, AB, ...)
    - First column: Row numbers (0, 1, 2, 3, ...) - 0-based
    - Labels show ORIGINAL positions in the spreadsheet (before removing
      empty rows/columns)

    Row/column indices match browser_sheet_input directly:
    - Row label "0" in output = row index 0 in browser_sheet_input
    - Column label "A" in output = col index 0 in browser_sheet_input
    - Column label "C" in output = col index 2 in browser_sheet_input

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): Confirmation message, or an error
              description if reading failed.
            - "content" (str): Tab-separated spreadsheet content with
              row/column labels. Format:
              Line 1: "\tA\tB\tC" (column headers)
              Line 2+: "0\tdata1\tdata2\tdata3" (row number + data)
            - "snapshot" (str): Always empty string (sheet tools don't
              return snapshots).
            - "tabs" (List[Dict]): Information about all open tabs.
            - "current_tab" (int): Index of the active tab.
            - "total_tabs" (int): Total number of open tabs.

    Example output:
            A       B
        0   Name    Age
        1   Alice   30
        2   Bob     25
    """
    import asyncio
    import json
    import platform
    import uuid

    ws_wrapper = await self._get_ws_wrapper()

    # Use unique ID to avoid conflicts in parallel execution
    request_id = str(uuid.uuid4())
    var_name = f"__sheetCopy_{request_id.replace('-', '_')}"

    def count_non_empty_cells(content: Any) -> int:
        # Number of cells holding non-whitespace text in a captured copy.
        if not content:
            return 0
        return sum(
            1
            for line in content.split('\n')
            for cell in line.split('\t')
            if cell.strip()
        )

    try:
        # Step 1: Setup copy interception with multiple captures
        js_inject = f"""
        window.{var_name} = [];
        let copyCount = 0;
        const copyListener = function(e) {{
            try {{
                // Intercept clipboard data before system clipboard write
                // Capture from Google Sheets' setData call
                const originalSetData = e.clipboardData.setData.bind(
                    e.clipboardData
                );
                let capturedText = '';

                e.clipboardData.setData = function(type, data) {{
                    if (type === 'text/plain') {{
                        capturedText = data;
                    }}
                    // Prevent system clipboard write
                }};

                // Let Google Sheets process event (calls setData)
                // Event propagates and Sheets tries to set clipboard
                setTimeout(() => {{
                    copyCount++;
                    window.{var_name}.push(capturedText);
                }}, 0);

                // Prevent the default browser copy behavior
                e.preventDefault();
            }} catch (err) {{
                console.error(
                    '[SheetRead] Failed to intercept copy data:', err
                );
            }}
        }};

        document.addEventListener('copy', copyListener, true);
        window.{var_name}_removeListener = () => {{
            document.removeEventListener('copy', copyListener, true);
        }};

        'Copy listener installed';
        """
        await ws_wrapper.console_exec(js_inject)

        # Step 2: select all + copy. Only the modifier key depends on the
        # platform, so a single operation list serves both cases.
        modifier = "Meta" if platform.system() == "Darwin" else "Control"
        select_all_copy_ops: List[Dict[str, Any]] = [
            {"type": "press", "keys": [modifier, "a"]},
            {"type": "wait", "delay": 100},
            {"type": "press", "keys": [modifier, "c"]},
        ]
        # Run twice: the sequence is repeated to capture the correct
        # content, and the better of the two captures is chosen below.
        for _ in range(2):
            await ws_wrapper._send_command(
                'batch_keyboard_input',
                {
                    'operations': select_all_copy_ops,
                    'skipStabilityWait': True,
                },
            )
            await asyncio.sleep(0.2)

        # Step 3: fetch what the listener captured
        js_check = f"window.{var_name} || []"
        content_result = await ws_wrapper.console_exec(js_check)
        result_str = content_result.get("result", "[]")

        if isinstance(result_str, list):
            captured_contents = result_str
        elif isinstance(result_str, str):
            prefix = "Console execution result: "
            if result_str.startswith(prefix):
                result_str = result_str[len(prefix):]
            result_str = result_str.strip()

            try:
                captured_contents = json.loads(result_str)
            except json.JSONDecodeError:
                captured_contents = []
            # Valid JSON that is not an array (object, string, number)
            # cannot be a list of captures; treat it as "nothing captured".
            if not isinstance(captured_contents, list):
                captured_contents = []
        else:
            captured_contents = []

        # Step 4: pick the capture with more non-empty cells
        if not captured_contents:
            sheet_content = ""
        elif len(captured_contents) == 1:
            sheet_content = captured_contents[0]
        else:
            first, second = captured_contents[:2]
            if count_non_empty_cells(first) > count_non_empty_cells(second):
                sheet_content = first
            else:
                sheet_content = second

        sheet_content = self._trim_sheet_content(sheet_content)

        tab_info = await ws_wrapper.get_tab_info()

        return {
            "result": "Successfully read spreadsheet content",
            "content": sheet_content,
            "snapshot": "",  # Sheet tools don't return snapshots
            "tabs": tab_info,
            "current_tab": next(
                (
                    i
                    for i, tab in enumerate(tab_info)
                    if tab.get("is_current")
                ),
                0,
            ),
            "total_tabs": len(tab_info),
        }

    except Exception as e:
        logger.error(f"Failed to read sheet: {e}")
        return {
            "result": f"Error reading sheet: {e}",
            "content": "",
            "snapshot": "",
            "tabs": [],
            "current_tab": 0,
            "total_tabs": 0,
        }
    finally:
        # Always remove the listener and the temporary globals; cleanup is
        # best-effort and must not mask the result computed above.
        js_cleanup = f"""
        if (window.{var_name}_removeListener) {{
            window.{var_name}_removeListener();
        }}
        delete window.{var_name};
        delete window.{var_name}_removeListener;
        'cleaned'
        """
        with contextlib.suppress(Exception):
            await ws_wrapper.console_exec(js_cleanup)
1802
+
1005
1803
  # Additional methods for backward compatibility
1006
1804
  async def browser_wait_user(
1007
1805
  self, timeout_sec: Optional[float] = None
@@ -1113,7 +1911,6 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
1113
1911
  headless=self._headless,
1114
1912
  user_data_dir=self._user_data_dir,
1115
1913
  stealth=self._stealth,
1116
- web_agent_model=self._web_agent_model,
1117
1914
  cache_dir=f"{self._cache_dir.rstrip('/')}_clone_"
1118
1915
  f"{new_session_id}/",
1119
1916
  enabled_tools=self.enabled_tools.copy(),
@@ -1127,6 +1924,8 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
1127
1924
  screenshot_timeout=self._screenshot_timeout,
1128
1925
  page_stability_timeout=self._page_stability_timeout,
1129
1926
  dom_content_loaded_timeout=self._dom_content_loaded_timeout,
1927
+ viewport_limit=self._viewport_limit,
1928
+ full_visual_mode=self._full_visual_mode,
1130
1929
  )
1131
1930
 
1132
1931
  def get_tools(self) -> List[FunctionTool]:
@@ -1146,25 +1945,22 @@ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
1146
1945
  "browser_select": self.browser_select,
1147
1946
  "browser_scroll": self.browser_scroll,
1148
1947
  "browser_enter": self.browser_enter,
1948
+ "browser_mouse_control": self.browser_mouse_control,
1949
+ "browser_mouse_drag": self.browser_mouse_drag,
1950
+ "browser_press_key": self.browser_press_key,
1149
1951
  "browser_wait_user": self.browser_wait_user,
1150
1952
  "browser_switch_tab": self.browser_switch_tab,
1151
1953
  "browser_close_tab": self.browser_close_tab,
1152
1954
  "browser_get_tab_info": self.browser_get_tab_info,
1955
+ "browser_console_view": self.browser_console_view,
1956
+ "browser_console_exec": self.browser_console_exec,
1957
+ "browser_sheet_input": self.browser_sheet_input,
1958
+ "browser_sheet_read": self.browser_sheet_read,
1153
1959
  }
1154
1960
 
1155
1961
  enabled_tools = []
1156
1962
 
1157
1963
  for tool_name in self.enabled_tools:
1158
- if (
1159
- tool_name == "browser_solve_task"
1160
- and self._web_agent_model is None
1161
- ):
1162
- logger.warning(
1163
- f"Tool '{tool_name}' is enabled but web_agent_model "
1164
- f"is not provided. Skipping this tool."
1165
- )
1166
- continue
1167
-
1168
1964
  if tool_name in tool_map:
1169
1965
  tool = FunctionTool(
1170
1966
  cast(Callable[..., Any], tool_map[tool_name])