cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/__init__.py +4 -0
- agent/adapters/azure_ml_adapter.py +283 -0
- agent/adapters/cua_adapter.py +161 -0
- agent/adapters/huggingfacelocal_adapter.py +67 -125
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +41 -0
- agent/adapters/models/generic.py +78 -0
- agent/adapters/models/internvl.py +290 -0
- agent/adapters/models/opencua.py +115 -0
- agent/adapters/models/qwen2_5_vl.py +78 -0
- agent/agent.py +337 -185
- agent/callbacks/__init__.py +9 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +54 -98
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +35 -33
- agent/callbacks/otel.py +291 -0
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/prompt_instructions.py +47 -0
- agent/callbacks/telemetry.py +99 -61
- agent/callbacks/trajectory_saver.py +95 -69
- agent/cli.py +269 -119
- agent/computers/__init__.py +14 -9
- agent/computers/base.py +32 -19
- agent/computers/cua.py +52 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +359 -235
- agent/integrations/hud/__init__.py +38 -99
- agent/integrations/hud/agent.py +369 -0
- agent/integrations/hud/proxy.py +166 -52
- agent/loops/__init__.py +44 -14
- agent/loops/anthropic.py +579 -492
- agent/loops/base.py +19 -15
- agent/loops/composed_grounded.py +136 -150
- agent/loops/fara/__init__.py +8 -0
- agent/loops/fara/config.py +506 -0
- agent/loops/fara/helpers.py +357 -0
- agent/loops/fara/schema.py +143 -0
- agent/loops/gelato.py +183 -0
- agent/loops/gemini.py +935 -0
- agent/loops/generic_vlm.py +601 -0
- agent/loops/glm45v.py +140 -135
- agent/loops/gta1.py +48 -51
- agent/loops/holo.py +218 -0
- agent/loops/internvl.py +180 -0
- agent/loops/moondream3.py +493 -0
- agent/loops/omniparser.py +326 -226
- agent/loops/openai.py +50 -51
- agent/loops/opencua.py +134 -0
- agent/loops/uiins.py +175 -0
- agent/loops/uitars.py +247 -206
- agent/loops/uitars2.py +951 -0
- agent/playground/__init__.py +5 -0
- agent/playground/server.py +301 -0
- agent/proxy/examples.py +61 -57
- agent/proxy/handlers.py +46 -39
- agent/responses.py +447 -347
- agent/tools/__init__.py +24 -0
- agent/tools/base.py +253 -0
- agent/tools/browser_tool.py +423 -0
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +25 -22
- agent/ui/gradio/ui_components.py +314 -167
- cua_agent-0.7.16.dist-info/METADATA +85 -0
- cua_agent-0.7.16.dist-info/RECORD +79 -0
- {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
- cua_agent-0.4.22.dist-info/METADATA +0 -436
- cua_agent-0.4.22.dist-info/RECORD +0 -51
- {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,423 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Browser Tool for agent interactions.
|
|
3
|
+
Allows agents to control a browser programmatically via Playwright.
|
|
4
|
+
Implements the computer_use action interface for comprehensive browser control.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import asyncio
|
|
8
|
+
import logging
|
|
9
|
+
from typing import TYPE_CHECKING, Optional, Union
|
|
10
|
+
|
|
11
|
+
from .base import BaseComputerTool, register_tool
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from computer.interface import GenericComputerInterface
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@register_tool("computer_use")
class BrowserTool(BaseComputerTool):
    """
    Browser tool exposing a ``computer_use``-style action interface.

    Browser-level commands (click, type, scroll, navigation, screenshot) are
    forwarded to ``self.interface.playwright_exec(command, params)``. Key presses
    and cursor moves are forwarded to ``self.interface.interface.hotkey`` and
    ``self.interface.interface.move_cursor``.

    NOTE(review): coordinates received in action params are forwarded unchanged.
    ``_proc_coords`` (resized-space -> viewport-space conversion) is defined here
    but never called inside this class -- presumably the caller applies it before
    invoking the tool; confirm.
    """

    # Dimensions used when the real viewport size cannot be measured.
    _DEFAULT_WIDTH = 1024
    _DEFAULT_HEIGHT = 768

    # Arguments passed to ``smart_resize`` when computing the resized dimensions.
    _RESIZE_FACTOR = 28
    _MIN_PIXELS = 3136
    _MAX_PIXELS = 12845056

    # Maps each supported action name to the coroutine method implementing it.
    _ACTION_HANDLERS = {
        "key": "_action_key",
        "type": "_action_type",
        "mouse_move": "_action_mouse_move",
        "left_click": "_action_left_click",
        "scroll": "_action_scroll",
        "visit_url": "_action_visit_url",
        "web_search": "_action_web_search",
        "history_back": "_action_history_back",
        "pause_and_memorize_fact": "_action_pause_and_memorize_fact",
        "wait": "_action_wait",
        "terminate": "_action_terminate",
    }

    def __init__(self, interface: "GenericComputerInterface", cfg: Optional[dict] = None):
        """
        Initialize the BrowserTool.

        Args:
            interface: Object providing ``playwright_exec`` (and, for key/cursor
                actions, a nested ``.interface`` with ``hotkey`` / ``move_cursor``).
            cfg: Optional configuration dictionary, passed to the base class.
        """
        self.interface = interface
        self._facts = []  # Facts stored by the pause_and_memorize_fact action

        # Filled in by _initialize_dimensions(); None means "not measured yet".
        self.viewport_width = None
        self.viewport_height = None
        self.resized_width = None
        self.resized_height = None

        # Best effort: measure the viewport now when we are allowed to block.
        # Inside a running event loop we cannot block, so the dimensions are
        # loaded lazily on first use instead (see _proc_coords).
        try:
            loop = self._get_event_loop()
            if not loop.is_running():
                loop.run_until_complete(self._initialize_dimensions())
        except Exception as exc:
            # Deliberately non-fatal, but no longer silent.
            logger.debug("Deferring dimension initialization to first use: %s", exc)

        super().__init__(cfg)

    @staticmethod
    def _get_event_loop():
        """
        Return the current thread's event loop, creating one when needed.

        ``asyncio.get_event_loop()`` raises ``RuntimeError`` when the thread has
        no loop (worker threads, and every thread on newer Python versions). In
        that case, or when the current loop is closed, a fresh loop is created
        and installed so it can be reused by later calls.
        """
        try:
            loop = asyncio.get_event_loop()
        except RuntimeError:
            loop = None
        if loop is None or loop.is_closed():
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
        return loop

    async def _initialize_dimensions(self):
        """
        Measure viewport and resized dimensions from a screenshot.

        On any failure (missing dependency, screenshot error, decode error) a
        warning is logged and all four dimensions fall back to the defaults.
        """
        try:
            import base64
            import io

            from PIL import Image
            from qwen_vl_utils import smart_resize

            # Take a screenshot to get the actual dimensions.
            screenshot_b64 = await self.screenshot()
            with Image.open(io.BytesIO(base64.b64decode(screenshot_b64))) as im:
                width, height = im.width, im.height

            # smart_resize takes and returns (height, width).
            rh, rw = smart_resize(
                height,
                width,
                factor=self._RESIZE_FACTOR,
                min_pixels=self._MIN_PIXELS,
                max_pixels=self._MAX_PIXELS,
            )

            self.viewport_width = width
            self.viewport_height = height
            self.resized_width = rw
            self.resized_height = rh

        except Exception as e:
            # Fall back to defaults if initialization fails.
            logger.warning(f"Failed to initialize dimensions: {e}")
            self.viewport_width = self._DEFAULT_WIDTH
            self.viewport_height = self._DEFAULT_HEIGHT
            self.resized_width = self._DEFAULT_WIDTH
            self.resized_height = self._DEFAULT_HEIGHT

    async def _proc_coords(self, x: float, y: float) -> tuple:
        """
        Convert coordinates from resized space to viewport space.

        Args:
            x: X coordinate in resized space (0 to resized_width)
            y: Y coordinate in resized space (0 to resized_height)

        Returns:
            Tuple of (viewport_x, viewport_y) as rounded integers.
        """
        # Lazily measure the dimensions if __init__ could not do it.
        dimensions = (
            self.resized_width,
            self.resized_height,
            self.viewport_width,
            self.viewport_height,
        )
        if any(value is None for value in dimensions):
            await self._initialize_dimensions()

        # Normalize by the resized dimensions, then scale to the viewport.
        viewport_x = (x / self.resized_width) * self.viewport_width
        viewport_y = (y / self.resized_height) * self.viewport_height

        return int(round(viewport_x)), int(round(viewport_y))

    @property
    def description(self) -> str:
        """Tool description text, including the current (or default) resolution."""
        # Use resized dimensions if available, otherwise the defaults.
        width = self.resized_width if self.resized_width is not None else self._DEFAULT_WIDTH
        height = self.resized_height if self.resized_height is not None else self._DEFAULT_HEIGHT

        # NOTE(review): the pieces below are concatenated with no separator,
        # which matches the original backslash-continued literal (bullets are
        # NOT newline-separated). Confirm whether newlines were intended before
        # changing this model-facing text.
        return (
            "Use a mouse and keyboard to interact with a computer, and take screenshots."
            "* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications."
            "* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try wait and taking another screenshot."
            f"* The screen's resolution is {width}x{height}."
            "* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor."
            "* If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click."
            "* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked."
            "* When a separate scrollable container prominently overlays the webpage, if you want to scroll within it, you typically need to mouse_move() over it first and then scroll()."
            "* If a popup window appears that you want to close, if left_click() on the 'X' or close button doesn't work, try key(keys=['Escape']) to close it."
            "* On some search bars, when you type(), you may need to press_enter=False and instead separately call left_click() on the search button to submit the search query. This is especially true of search bars that have auto-suggest popups for e.g. locations"
            "* For calendar widgets, you usually need to left_click() on arrows to move between months and left_click() on dates to select them; type() is not typically used to input dates there."
        ).strip()

    @property
    def parameters(self) -> dict:
        """JSON-schema style description of the arguments accepted by ``call``."""
        action_description = "\n".join(
            (
                "The action to perform. The available actions are:",
                "* key: Performs key down presses on the arguments passed in order, then performs key releases in reverse order. Includes 'Enter', 'Alt', 'Shift', 'Tab', 'Control', 'Backspace', 'Delete', 'Escape', 'ArrowUp', 'ArrowDown', 'ArrowLeft', 'ArrowRight', 'PageDown', 'PageUp', 'Shift', etc.",
                "* type: Type a string of text on the keyboard.",
                "* mouse_move: Move the cursor to a specified (x, y) pixel coordinate on the screen.",
                "* left_click: Click the left mouse button.",
                "* scroll: Performs a scroll of the mouse scroll wheel.",
                "* visit_url: Visit a specified URL.",
                "* web_search: Perform a web search with a specified query.",
                "* history_back: Go back to the previous page in the browser history.",
                "* pause_and_memorize_fact: Pause and memorize a fact for future reference.",
                "* wait: Wait specified seconds for the change to happen.",
                "* terminate: Terminate the current task and report its completion status.",
            )
        )
        return {
            "type": "object",
            "properties": {
                "action": {
                    "description": action_description,
                    "enum": [
                        "key",
                        "type",
                        "mouse_move",
                        "left_click",
                        "scroll",
                        "visit_url",
                        "web_search",
                        "history_back",
                        "pause_and_memorize_fact",
                        "wait",
                        "terminate",
                    ],
                    "type": "string",
                },
                "keys": {"description": "Required only by action=key.", "type": "array"},
                "text": {"description": "Required only by action=type.", "type": "string"},
                "coordinate": {
                    "description": "(x, y) coordinates for mouse actions. Required only by action=left_click, action=mouse_move, and action=type.",
                    "type": "array",
                },
                "pixels": {
                    "description": "Amount of scrolling. Positive = up, Negative = down. Required only by action=scroll.",
                    "type": "number",
                },
                "url": {
                    "description": "The URL to visit. Required only by action=visit_url.",
                    "type": "string",
                },
                "query": {
                    "description": "The query to search for. Required only by action=web_search.",
                    "type": "string",
                },
                "fact": {
                    "description": "The fact to remember for the future. Required only by action=pause_and_memorize_fact.",
                    "type": "string",
                },
                "time": {
                    "description": "Seconds to wait. Required only by action=wait.",
                    "type": "number",
                },
                "status": {
                    "description": "Status of the task. Required only by action=terminate.",
                    "type": "string",
                    "enum": ["success", "failure"],
                },
            },
            "required": ["action"],
        }

    def call(self, params: Union[str, dict], **kwargs) -> Union[str, dict]:
        """
        Execute a browser action synchronously.

        Args:
            params: Action parameters (JSON string or dict); must contain "action".
            **kwargs: Additional keyword arguments (unused here).

        Returns:
            The result dict of the action, or ``{"success": False, "error": ...}``
            when the action is missing or execution fails.
        """
        # Verify and parse parameters.
        params_dict = self._verify_json_format_args(params)
        action = params_dict.get("action")

        if not action:
            return {"success": False, "error": "action parameter is required"}

        try:
            loop = self._get_event_loop()
            if loop.is_running():
                # Already inside an async context: run_until_complete would
                # fail, so run the coroutine on its own loop in a worker thread
                # and block until it finishes.
                import concurrent.futures

                with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
                    future = executor.submit(asyncio.run, self._execute_action(action, params_dict))
                    return future.result()
            return loop.run_until_complete(self._execute_action(action, params_dict))
        except Exception as e:
            logger.error(f"Error executing action {action}: {e}")
            return {"success": False, "error": str(e)}

    async def _execute_action(self, action: str, params: dict) -> dict:
        """
        Dispatch ``action`` to its handler and return the handler's result.

        Unknown actions and exceptions raised by a handler are reported as
        ``{"success": False, "error": ...}`` rather than propagated.
        """
        try:
            handler_name = self._ACTION_HANDLERS.get(action) if isinstance(action, str) else None
            if handler_name is None:
                return {"success": False, "error": f"Unknown action: {action}"}
            return await getattr(self, handler_name)(params)
        except Exception as e:
            logger.error(f"Error in action {action}: {e}")
            return {"success": False, "error": str(e)}

    @staticmethod
    def _get_coordinate(params: dict):
        """Return ``params["coordinate"]`` if it is a 2-item list/tuple, else None."""
        coordinate = params.get("coordinate")
        if isinstance(coordinate, (list, tuple)) and len(coordinate) == 2:
            return coordinate
        return None

    async def _action_key(self, params: dict) -> dict:
        """Press the given keys together via the interface's ``hotkey``."""
        keys = params.get("keys", [])
        if not keys:
            return {"success": False, "error": "keys parameter is required"}
        if isinstance(keys, str):
            # A bare string would otherwise be splatted into single characters.
            keys = [keys]

        try:
            await self.interface.interface.hotkey(*keys)
            return {"success": True, "message": f"Pressed keys: {keys}"}
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def _action_type(self, params: dict) -> dict:
        """Type text, optionally clicking ``coordinate`` first to focus a field."""
        text = params.get("text")
        if not text:
            return {"success": False, "error": "text parameter is required"}

        # If a coordinate is provided, click there first.
        coordinate = self._get_coordinate(params)
        if coordinate is not None:
            click_result = await self.interface.playwright_exec(
                "click", {"x": coordinate[0], "y": coordinate[1]}
            )
            # Do not type into whatever happens to be focused if the click
            # explicitly reported failure.
            if isinstance(click_result, dict) and click_result.get("success") is False:
                return click_result

        return await self.interface.playwright_exec("type", {"text": text})

    async def _action_mouse_move(self, params: dict) -> dict:
        """Move the cursor to ``coordinate``."""
        coordinate = self._get_coordinate(params)
        if coordinate is None:
            return {"success": False, "error": "coordinate parameter [x, y] is required"}

        await self.interface.interface.move_cursor(coordinate[0], coordinate[1])
        return {"success": True, "message": f"Moved cursor to {coordinate}"}

    async def _action_left_click(self, params: dict) -> dict:
        """Left-click at ``coordinate``."""
        coordinate = self._get_coordinate(params)
        if coordinate is None:
            return {"success": False, "error": "coordinate parameter [x, y] is required"}

        return await self.interface.playwright_exec(
            "click", {"x": coordinate[0], "y": coordinate[1]}
        )

    async def _action_scroll(self, params: dict) -> dict:
        """Scroll vertically by ``pixels`` (positive = up, negative = down)."""
        pixels = params.get("pixels", 0)
        if not pixels:  # missing, None or 0
            return {"success": False, "error": "pixels parameter is required"}

        # Positive = up (negative delta_y), Negative = down (positive delta_y).
        return await self.interface.playwright_exec("scroll", {"delta_x": 0, "delta_y": -pixels})

    async def _action_visit_url(self, params: dict) -> dict:
        """Navigate to ``url``."""
        url = params.get("url")
        if not url:
            return {"success": False, "error": "url parameter is required"}

        return await self.interface.playwright_exec("visit_url", {"url": url})

    async def _action_web_search(self, params: dict) -> dict:
        """Run a web search for ``query``."""
        query = params.get("query")
        if not query:
            return {"success": False, "error": "query parameter is required"}

        return await self.interface.playwright_exec("web_search", {"query": query})

    async def _action_history_back(self, params: dict) -> dict:
        """Go back in browser history by pressing Alt+ArrowLeft."""
        try:
            await self.interface.interface.hotkey("Alt", "ArrowLeft")
            return {"success": True, "message": "Navigated back in history"}
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def _action_pause_and_memorize_fact(self, params: dict) -> dict:
        """Append ``fact`` to the in-memory fact list."""
        fact = params.get("fact")
        if not fact:
            return {"success": False, "error": "fact parameter is required"}

        self._facts.append(fact)
        return {
            "success": True,
            "message": f"Memorized fact: {fact}",
            "total_facts": len(self._facts),
        }

    async def _action_wait(self, params: dict) -> dict:
        """Sleep for ``time`` seconds; the value must be a positive number."""
        seconds = params.get("time", 0)
        if not isinstance(seconds, (int, float)) or seconds <= 0:
            return {"success": False, "error": "time parameter must be positive"}

        await asyncio.sleep(seconds)
        return {"success": True, "message": f"Waited {seconds} seconds"}

    async def _action_terminate(self, params: dict) -> dict:
        """Report task termination, including any memorized facts."""
        status = params.get("status", "success")
        message = f"Task terminated with status: {status}"

        if self._facts:
            message += f"\nMemorized facts: {self._facts}"

        return {"success": True, "status": status, "message": message, "terminated": True}

    # Legacy methods for backward compatibility
    async def visit_url(self, url: str) -> dict:
        """Navigate to a URL."""
        return await self._action_visit_url({"url": url})

    async def click(self, x: int, y: int) -> dict:
        """Click at coordinates."""
        return await self._action_left_click({"coordinate": [x, y]})

    async def type(self, text: str) -> dict:
        """Type text into the focused element."""
        return await self._action_type({"text": text})

    async def scroll(self, delta_x: int, delta_y: int) -> dict:
        """
        Scroll the page by the given deltas.

        A purely vertical scroll goes through the ``scroll`` action handler
        (which rejects a zero amount). When ``delta_x`` is non-zero both deltas
        are sent to ``playwright_exec`` so the horizontal component is kept.
        """
        if not delta_x:
            # _action_scroll takes "pixels" with positive = up, i.e. -delta_y.
            return await self._action_scroll({"pixels": -delta_y})
        return await self.interface.playwright_exec(
            "scroll", {"delta_x": delta_x, "delta_y": delta_y}
        )

    async def web_search(self, query: str) -> dict:
        """Run a web search for the query."""
        return await self._action_web_search({"query": query})

    async def screenshot(self) -> str:
        """
        Take a screenshot of the current browser page.

        Returns:
            The ``"screenshot"`` value of the playwright result (decoded as
            base64 by ``_initialize_dimensions``).

        Raises:
            RuntimeError: If the result is unsuccessful or has no screenshot.
        """
        result = await self.interface.playwright_exec("screenshot", {})
        if result.get("success") and result.get("screenshot"):
            return result["screenshot"]

        error = result.get("error", "Unknown error")
        raise RuntimeError(f"Failed to take screenshot: {error}")

    async def get_current_url(self) -> str:
        """
        Get the current URL of the browser page.

        Raises:
            RuntimeError: If the result is unsuccessful or has no URL.
        """
        result = await self.interface.playwright_exec("get_current_url", {})
        if result.get("success") and result.get("url"):
            return result["url"]

        error = result.get("error", "Unknown error")
        raise RuntimeError(f"Failed to get current URL: {error}")
|
agent/types.py
CHANGED
|
@@ -2,37 +2,43 @@
|
|
|
2
2
|
Type definitions for agent
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
-
from typing import Dict, List, Any, Optional, Callable, Protocol, Literal
|
|
6
|
-
from pydantic import BaseModel
|
|
7
5
|
import re
|
|
8
|
-
from litellm import ResponseInputParam, ResponsesAPIResponse, ToolParam
|
|
9
6
|
from collections.abc import Iterable
|
|
7
|
+
from typing import Any, Callable, Dict, List, Literal, Optional, Protocol
|
|
8
|
+
|
|
9
|
+
from litellm import ResponseInputParam, ResponsesAPIResponse, ToolParam
|
|
10
|
+
from pydantic import BaseModel
|
|
10
11
|
|
|
11
12
|
# Agent input types
|
|
12
13
|
Messages = str | ResponseInputParam | List[Dict[str, Any]]
|
|
13
14
|
Tools = Optional[Iterable[ToolParam]]
|
|
14
15
|
|
|
15
16
|
# Agent output types
|
|
16
|
-
AgentResponse = ResponsesAPIResponse
|
|
17
|
+
AgentResponse = ResponsesAPIResponse
|
|
17
18
|
AgentCapability = Literal["step", "click"]
|
|
18
19
|
|
|
20
|
+
|
|
19
21
|
# Exception types
|
|
20
22
|
class ToolError(RuntimeError):
    """Common base class for errors raised by tools."""
|
|
23
26
|
|
|
27
|
+
|
|
24
28
|
class IllegalArgumentError(ToolError):
    """Tool error signalling that a function received invalid arguments."""
|
|
27
32
|
|
|
28
33
|
|
|
29
34
|
# Agent config registration
|
|
30
35
|
class AgentConfigInfo(BaseModel):
    """Registry record describing one registered agent config."""

    # Class registered for this config.
    agent_class: type
    # Regular expression tested against model names (see matches_model).
    models_regex: str
    # Defaults to 0; how it is used is decided by the registry, not here.
    priority: int = 0

    def matches_model(self, model: str) -> bool:
        """Return True when ``models_regex`` matches at the start of ``model``."""
        found = re.match(self.models_regex, model)
        return found is not None
|
agent/ui/__init__.py
CHANGED
agent/ui/__main__.py
CHANGED
agent/ui/gradio/app.py
CHANGED
|
@@ -6,9 +6,9 @@ with an advanced UI for model selection and configuration.
|
|
|
6
6
|
|
|
7
7
|
Supported Agent Models:
|
|
8
8
|
- OpenAI: openai/computer-use-preview
|
|
9
|
-
- Anthropic: anthropic/claude-
|
|
9
|
+
- Anthropic: anthropic/claude-sonnet-4-5-20250929, anthropic/claude-3-7-sonnet-20250219
|
|
10
10
|
- UI-TARS: huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
|
|
11
|
-
- Omniparser: omniparser+anthropic/claude-
|
|
11
|
+
- Omniparser: omniparser+anthropic/claude-sonnet-4-5-20250929, omniparser+ollama_chat/gemma3
|
|
12
12
|
|
|
13
13
|
Requirements:
|
|
14
14
|
- Mac with Apple Silicon (M1/M2/M3/M4), Linux, or Windows
|
|
@@ -18,21 +18,21 @@ Requirements:
|
|
|
18
18
|
- OpenAI or Anthropic API key
|
|
19
19
|
"""
|
|
20
20
|
|
|
21
|
-
import os
|
|
22
21
|
import asyncio
|
|
23
|
-
import logging
|
|
24
22
|
import json
|
|
23
|
+
import logging
|
|
24
|
+
import os
|
|
25
25
|
import platform
|
|
26
26
|
from pathlib import Path
|
|
27
|
-
from typing import
|
|
27
|
+
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union, cast
|
|
28
|
+
|
|
28
29
|
import gradio as gr
|
|
29
|
-
from gradio.components.chatbot import MetadataDict
|
|
30
|
-
from typing import cast
|
|
31
30
|
|
|
32
31
|
# Import from agent package
|
|
33
32
|
from agent import ComputerAgent
|
|
34
|
-
from agent.types import
|
|
33
|
+
from agent.types import AgentResponse, Messages
|
|
35
34
|
from computer import Computer
|
|
35
|
+
from gradio.components.chatbot import MetadataDict
|
|
36
36
|
|
|
37
37
|
# Global variables
|
|
38
38
|
global_agent = None
|
|
@@ -42,11 +42,13 @@ SETTINGS_FILE = Path(".gradio_settings.json")
|
|
|
42
42
|
logging.basicConfig(level=logging.INFO)
|
|
43
43
|
|
|
44
44
|
import dotenv
|
|
45
|
+
|
|
45
46
|
if dotenv.load_dotenv():
|
|
46
47
|
print(f"DEBUG - Loaded environment variables from {dotenv.find_dotenv()}")
|
|
47
48
|
else:
|
|
48
49
|
print("DEBUG - No .env file found")
|
|
49
50
|
|
|
51
|
+
|
|
50
52
|
# --- Settings Load/Save Functions ---
|
|
51
53
|
def load_settings() -> Dict[str, Any]:
|
|
52
54
|
"""Loads settings from the JSON file."""
|
|
@@ -84,7 +86,7 @@ def save_settings(settings: Dict[str, Any]):
|
|
|
84
86
|
# async def on_screenshot(self, screenshot_base64: str, action_type: str = "") -> None:
|
|
85
87
|
# """Add screenshot to chatbot when a screenshot is taken."""
|
|
86
88
|
# image_markdown = f""
|
|
87
|
-
|
|
89
|
+
|
|
88
90
|
# if self.chatbot_history is not None:
|
|
89
91
|
# self.chatbot_history.append(
|
|
90
92
|
# gr.ChatMessage(
|
|
@@ -114,14 +116,12 @@ MODEL_MAPPINGS = {
|
|
|
114
116
|
"Anthropic: Claude 4 Opus (20250514)": "anthropic/claude-opus-4-20250514",
|
|
115
117
|
"Anthropic: Claude 4 Sonnet (20250514)": "anthropic/claude-sonnet-4-20250514",
|
|
116
118
|
"Anthropic: Claude 3.7 Sonnet (20250219)": "anthropic/claude-3-7-sonnet-20250219",
|
|
117
|
-
"Anthropic: Claude 3.5 Sonnet (20241022)": "anthropic/claude-3-5-sonnet-20241022",
|
|
118
119
|
},
|
|
119
120
|
"omni": {
|
|
120
121
|
"default": "omniparser+openai/gpt-4o",
|
|
121
122
|
"OMNI: OpenAI GPT-4o": "omniparser+openai/gpt-4o",
|
|
122
123
|
"OMNI: OpenAI GPT-4o mini": "omniparser+openai/gpt-4o-mini",
|
|
123
124
|
"OMNI: Claude 3.7 Sonnet (20250219)": "omniparser+anthropic/claude-3-7-sonnet-20250219",
|
|
124
|
-
"OMNI: Claude 3.5 Sonnet (20241022)": "omniparser+anthropic/claude-3-5-sonnet-20241022",
|
|
125
125
|
},
|
|
126
126
|
"uitars": {
|
|
127
127
|
"default": "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B" if is_mac else "ui-tars",
|
|
@@ -141,7 +141,7 @@ def get_model_string(model_name: str, loop_provider: str) -> str:
|
|
|
141
141
|
ollama_model = model_name.split("OMNI: Ollama ", 1)[1]
|
|
142
142
|
return f"omniparser+ollama_chat/{ollama_model}"
|
|
143
143
|
return "omniparser+ollama_chat/llama3"
|
|
144
|
-
|
|
144
|
+
|
|
145
145
|
# Map based on loop provider
|
|
146
146
|
mapping = MODEL_MAPPINGS.get(loop_provider.lower(), MODEL_MAPPINGS["openai"])
|
|
147
147
|
return mapping.get(model_name, mapping["default"])
|
|
@@ -151,6 +151,7 @@ def get_ollama_models() -> List[str]:
|
|
|
151
151
|
"""Get available models from Ollama if installed."""
|
|
152
152
|
try:
|
|
153
153
|
import subprocess
|
|
154
|
+
|
|
154
155
|
result = subprocess.run(["ollama", "list"], capture_output=True, text=True)
|
|
155
156
|
if result.returncode == 0:
|
|
156
157
|
lines = result.stdout.strip().split("\n")
|
|
@@ -174,16 +175,14 @@ def create_computer_instance(
|
|
|
174
175
|
os_type: str = "macos",
|
|
175
176
|
provider_type: str = "lume",
|
|
176
177
|
name: Optional[str] = None,
|
|
177
|
-
api_key: Optional[str] = None
|
|
178
|
+
api_key: Optional[str] = None,
|
|
178
179
|
) -> Computer:
|
|
179
180
|
"""Create or get the global Computer instance."""
|
|
180
181
|
global global_computer
|
|
181
182
|
if global_computer is None:
|
|
182
183
|
if provider_type == "localhost":
|
|
183
184
|
global_computer = Computer(
|
|
184
|
-
verbosity=verbosity,
|
|
185
|
-
os_type=os_type,
|
|
186
|
-
use_host_computer_server=True
|
|
185
|
+
verbosity=verbosity, os_type=os_type, use_host_computer_server=True
|
|
187
186
|
)
|
|
188
187
|
else:
|
|
189
188
|
global_computer = Computer(
|
|
@@ -191,7 +190,7 @@ def create_computer_instance(
|
|
|
191
190
|
os_type=os_type,
|
|
192
191
|
provider_type=provider_type,
|
|
193
192
|
name=name if name else "",
|
|
194
|
-
api_key=api_key
|
|
193
|
+
api_key=api_key,
|
|
195
194
|
)
|
|
196
195
|
return global_computer
|
|
197
196
|
|
|
@@ -217,7 +216,7 @@ def create_agent(
|
|
|
217
216
|
os_type=computer_os,
|
|
218
217
|
provider_type=computer_provider,
|
|
219
218
|
name=computer_name,
|
|
220
|
-
api_key=computer_api_key
|
|
219
|
+
api_key=computer_api_key,
|
|
221
220
|
)
|
|
222
221
|
|
|
223
222
|
# Handle custom models
|
|
@@ -233,12 +232,15 @@ def create_agent(
|
|
|
233
232
|
"only_n_most_recent_images": only_n_most_recent_images,
|
|
234
233
|
"verbosity": verbosity,
|
|
235
234
|
}
|
|
236
|
-
|
|
235
|
+
|
|
237
236
|
if save_trajectory:
|
|
238
237
|
agent_kwargs["trajectory_dir"] = "trajectories"
|
|
239
|
-
|
|
238
|
+
|
|
240
239
|
if max_trajectory_budget:
|
|
241
|
-
agent_kwargs["max_trajectory_budget"] = {
|
|
240
|
+
agent_kwargs["max_trajectory_budget"] = {
|
|
241
|
+
"max_budget": max_trajectory_budget,
|
|
242
|
+
"raise_error": True,
|
|
243
|
+
}
|
|
242
244
|
|
|
243
245
|
global_agent = ComputerAgent(**agent_kwargs)
|
|
244
246
|
return global_agent
|
|
@@ -247,7 +249,8 @@ def create_agent(
|
|
|
247
249
|
def launch_ui():
|
|
248
250
|
"""Standalone function to launch the Gradio app."""
|
|
249
251
|
from agent.ui.gradio.ui_components import create_gradio_ui
|
|
250
|
-
|
|
252
|
+
|
|
253
|
+
print("Starting Gradio app for Cua Agent...")
|
|
251
254
|
demo = create_gradio_ui()
|
|
252
255
|
demo.launch(share=False, inbrowser=True)
|
|
253
256
|
|