cua-agent 0.1.29__py3-none-any.whl → 0.1.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/core/factory.py +19 -0
- agent/core/types.py +1 -0
- agent/providers/uitars/__init__.py +1 -0
- agent/providers/uitars/clients/base.py +35 -0
- agent/providers/uitars/clients/oaicompat.py +204 -0
- agent/providers/uitars/loop.py +595 -0
- agent/providers/uitars/prompts.py +59 -0
- agent/providers/uitars/tools/__init__.py +1 -0
- agent/providers/uitars/tools/computer.py +279 -0
- agent/providers/uitars/tools/manager.py +60 -0
- agent/providers/uitars/utils.py +153 -0
- agent/ui/gradio/app.py +12 -2
- {cua_agent-0.1.29.dist-info → cua_agent-0.1.30.dist-info}/METADATA +12 -18
- {cua_agent-0.1.29.dist-info → cua_agent-0.1.30.dist-info}/RECORD +16 -7
- {cua_agent-0.1.29.dist-info → cua_agent-0.1.30.dist-info}/WHEEL +0 -0
- {cua_agent-0.1.29.dist-info → cua_agent-0.1.30.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
"""Computer tool for UI-TARS."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import base64
|
|
5
|
+
import logging
|
|
6
|
+
import re
|
|
7
|
+
from typing import Any, Dict, List, Optional, Literal, Union
|
|
8
|
+
|
|
9
|
+
from computer import Computer
|
|
10
|
+
from ....core.tools.base import ToolResult, ToolFailure
|
|
11
|
+
from ....core.tools.computer import BaseComputerTool
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ComputerTool(BaseComputerTool):
    """Computer tool for the UI-TARS agent.

    Maps actions from the UI-TARS action space (``click``, ``left_double``,
    ``right_single``, ``type_text``, ``hotkey``, ``drag``, ``scroll``,
    ``wait``, ``finished``) onto calls to ``computer.interface``. Every action
    except ``finished`` returns a fresh screenshot, base64-encoded, alongside
    a short textual description of what was done.
    """

    name: str = "computer"
    # Screen size in pixels; populated by initialize_dimensions().
    width: Optional[int] = None
    height: Optional[int] = None
    computer: Computer

    def __init__(self, computer: Computer):
        """Initialize the computer tool.

        Args:
            computer: Computer instance
        """
        super().__init__(computer)
        self.computer = computer
        self.width = None
        self.height = None
        self.logger = logging.getLogger(__name__)

    def to_params(self) -> Dict[str, Any]:
        """Convert tool to API parameters.

        Returns:
            Dictionary with the tool type and the display width/height.

        Raises:
            RuntimeError: If the screen dimensions have not been initialized.
        """
        if self.width is None or self.height is None:
            raise RuntimeError(
                "Screen dimensions not initialized. Call initialize_dimensions() first."
            )
        return {
            "type": "computer",
            "display_width": self.width,
            "display_height": self.height,
        }

    async def initialize_dimensions(self) -> None:
        """Initialize screen dimensions from the computer interface.

        Falls back to 1024x768 (and logs a warning) when the interface cannot
        report a screen size, so the dimensions are always set on return.
        """
        try:
            display_size = await self.computer.interface.get_screen_size()
            self.width = display_size["width"]
            self.height = display_size["height"]
            self.logger.info(f"Initialized screen dimensions to {self.width}x{self.height}")
        except Exception as e:
            # Fall back to defaults if we can't get accurate dimensions
            self.width = 1024
            self.height = 768
            self.logger.warning(
                f"Failed to get screen dimensions, using defaults: {self.width}x{self.height}. Error: {e}"
            )

    async def _capture_result(self, output: str, delay: float) -> ToolResult:
        """Wait for the UI to settle, then wrap a screenshot in a ToolResult.

        Args:
            output: Human-readable description of the action just performed.
            delay: Seconds to sleep before taking the screenshot.

        Returns:
            ToolResult carrying ``output`` and the base64-encoded screenshot.
        """
        await asyncio.sleep(delay)
        screenshot = await self.computer.interface.screenshot()
        return ToolResult(
            output=output,
            base64_image=base64.b64encode(screenshot).decode("utf-8"),
        )

    async def __call__(
        self,
        *,
        action: str,
        **kwargs,
    ) -> ToolResult:
        """Execute a computer action.

        Args:
            action: The action to perform (based on UI-TARS action space)
            **kwargs: Additional parameters for the action

        Returns:
            ToolResult containing action output and possibly a base64 image.
            A ToolFailure is returned (never raised) for missing parameters,
            unsupported actions, and any exception raised while executing.
        """
        try:
            # Ensure dimensions are initialized
            if self.width is None or self.height is None:
                await self.initialize_dimensions()
                if self.width is None or self.height is None:
                    return ToolFailure(error="Failed to initialize screen dimensions")

            # Handle actions defined in UI-TARS action space (from prompts.py)
            # Handle standard click (left click)
            if action == "click":
                if "x" not in kwargs or "y" not in kwargs:
                    return ToolFailure(error="Missing coordinates for click action")
                x, y = kwargs["x"], kwargs["y"]
                await self.computer.interface.left_click(x, y)
                return await self._capture_result(f"Clicked at ({x}, {y})", 0.5)

            # Handle double click
            elif action == "left_double":
                if "x" not in kwargs or "y" not in kwargs:
                    return ToolFailure(error="Missing coordinates for left_double action")
                x, y = kwargs["x"], kwargs["y"]
                await self.computer.interface.double_click(x, y)
                return await self._capture_result(f"Double-clicked at ({x}, {y})", 0.5)

            # Handle right click
            elif action == "right_single":
                if "x" not in kwargs or "y" not in kwargs:
                    return ToolFailure(error="Missing coordinates for right_single action")
                x, y = kwargs["x"], kwargs["y"]
                await self.computer.interface.right_click(x, y)
                return await self._capture_result(f"Right-clicked at ({x}, {y})", 0.5)

            # Handle typing text
            elif action == "type_text":
                if "text" not in kwargs:
                    return ToolFailure(error="Missing text for type action")
                text = kwargs["text"]
                await self.computer.interface.type_text(text)
                return await self._capture_result(f"Typed: {text}", 0.3)

            # Handle hotkey
            elif action == "hotkey":
                if "keys" not in kwargs:
                    return ToolFailure(error="Missing keys for hotkey action")
                keys = kwargs["keys"]
                # Keys are sent one after another via press_key.
                for key in keys:
                    await self.computer.interface.press_key(key)
                return await self._capture_result(f"Pressed hotkey: {', '.join(keys)}", 0.3)

            # Handle drag action
            elif action == "drag":
                if not all(k in kwargs for k in ("start_x", "start_y", "end_x", "end_y")):
                    return ToolFailure(error="Missing coordinates for drag action")
                start_x, start_y = kwargs["start_x"], kwargs["start_y"]
                end_x, end_y = kwargs["end_x"], kwargs["end_y"]

                # Perform drag: position the cursor first, then drag to the target
                await self.computer.interface.move_cursor(start_x, start_y)
                await self.computer.interface.drag_to(end_x, end_y)
                return await self._capture_result(
                    f"Dragged from ({start_x}, {start_y}) to ({end_x}, {end_y})", 0.5
                )

            # Handle scroll action
            elif action == "scroll":
                if not all(k in kwargs for k in ("x", "y", "direction")):
                    return ToolFailure(error="Missing parameters for scroll action")
                x, y = kwargs["x"], kwargs["y"]
                direction = kwargs["direction"]

                # Move cursor to position
                await self.computer.interface.move_cursor(x, y)

                # Scroll based on direction
                if direction == "down":
                    await self.computer.interface.scroll_down(5)
                elif direction == "up":
                    await self.computer.interface.scroll_up(5)
                elif direction in ("right", "left"):
                    # Horizontal scrolling is accepted but not performed; make
                    # the no-op visible in the logs instead of staying silent.
                    self.logger.warning(
                        f"Horizontal scroll ({direction}) is not implemented; no scroll was performed"
                    )
                else:
                    return ToolFailure(error=f"Invalid scroll direction: {direction}")

                return await self._capture_result(f"Scrolled {direction} at ({x}, {y})", 0.5)

            # Handle wait action
            elif action == "wait":
                # Sleep for 5 seconds as specified in the action space
                return await self._capture_result("Waited for 5 seconds", 5)

            # Handle finished action (task completion); no screenshot is taken
            elif action == "finished":
                content = kwargs.get("content", "Task completed")
                return ToolResult(
                    output=f"Task finished: {content}",
                )

            else:
                return ToolFailure(error=f"Unsupported action: {action}")

        except Exception as e:
            self.logger.error(f"Error in ComputerTool.__call__: {str(e)}")
            return ToolFailure(error=f"Failed to execute {action}: {str(e)}")
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""Tool manager for the UI-TARS provider."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Any, Dict, List, Optional
|
|
5
|
+
|
|
6
|
+
from computer import Computer
|
|
7
|
+
from ....core.tools import BaseToolManager
|
|
8
|
+
from ....core.tools.collection import ToolCollection
|
|
9
|
+
from .computer import ComputerTool
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ToolManager(BaseToolManager):
    """Tool manager for the UI-TARS provider.

    Owns the provider's single ComputerTool and forwards parameter and
    execution requests to the tool collection.
    """

    def __init__(self, computer: Computer):
        """Create the manager and its UI-TARS computer tool.

        Args:
            computer: Computer instance for computer-related tools
        """
        super().__init__(computer)
        # The only UI-TARS-specific tool is the computer tool.
        self.computer_tool = ComputerTool(self.computer)
        self._initialized = False

    def _initialize_tools(self) -> ToolCollection:
        """Build the tool collection containing the computer tool."""
        return ToolCollection(self.computer_tool)

    async def _initialize_tools_specific(self) -> None:
        """Run provider-specific setup: fetch the screen dimensions."""
        await self.computer_tool.initialize_dimensions()

    def get_tool_params(self) -> List[Dict[str, Any]]:
        """Return the tool parameters for API calls.

        Returns:
            List of tool parameters for the current provider's API

        Raises:
            RuntimeError: If the tools have not been initialized yet.
        """
        collection = self.tools
        if collection is None:
            raise RuntimeError("Tools not initialized. Call initialize() first.")
        return collection.to_params()

    async def execute_tool(self, name: str, tool_input: dict[str, Any]) -> Any:
        """Run the named tool with the given input.

        Args:
            name: Name of the tool to execute
            tool_input: Input parameters for the tool

        Returns:
            Result of the tool execution

        Raises:
            RuntimeError: If the tools have not been initialized yet.
        """
        collection = self.tools
        if collection is None:
            raise RuntimeError("Tools not initialized. Call initialize() first.")
        return await collection.run(name=name, tool_input=tool_input)
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
"""Utility functions for the UI-TARS provider."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import base64
|
|
5
|
+
import re
|
|
6
|
+
from typing import Any, Dict, List, Optional, Union, Tuple
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def add_box_token(input_string: str) -> str:
    """Add box tokens to the coordinates in the model response.

    Every ``start_box='(x,y)'`` / ``end_box='(x,y)'`` found after an
    ``"Action: "`` marker is rewritten to
    ``start_box='<|box_start|>(x,y)<|box_end|>'``. Whitespace after the comma
    is accepted and dropped in the rewritten coordinate. Coordinates that are
    already wrapped are left untouched.

    Args:
        input_string: Raw model response

    Returns:
        String with box tokens added. The input is returned unchanged when it
        contains no ``"Action: "`` marker or no ``start_box=``. When several
        ``"Action: "`` markers are present, the text before the first one is
        kept as the prefix and the action segments are joined with blank lines.
    """
    marker = "Action: "
    if marker not in input_string or "start_box=" not in input_string:
        return input_string

    prefix, *actions = input_string.split(marker)

    # A single regex substitution both detects and rewrites the coordinate, so
    # "(10, 20)" (with a space) is wrapped just like "(10,20)".
    box_pattern = r"(start_box|end_box)='\((\d+),\s*(\d+)\)'"
    box_replacement = r"\1='<|box_start|>(\2,\3)<|box_end|>'"
    processed_actions = [
        re.sub(box_pattern, box_replacement, action.strip()) for action in actions
    ]

    return prefix + marker + "\n\n".join(processed_actions)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def parse_actions(response: str) -> List[str]:
    """Extract the action strings from a UI-TARS model response.

    Only the text after the last ``"Action:"`` marker is considered. That text
    is split on blank lines, and each non-empty piece is returned with its
    surrounding whitespace removed.

    Args:
        response: The raw model response text

    Returns:
        List of parsed actions; empty when there is no ``"Action:"`` marker or
        nothing follows it.
    """
    if "Action:" not in response:
        return []

    # Everything after the final marker is the action section.
    action_section = response.rpartition("Action:")[2].strip()
    return [chunk.strip() for chunk in action_section.split("\n\n") if chunk.strip()]
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def parse_action_parameters(action: str) -> Tuple[str, Dict[str, Any]]:
    """Parse parameters from an action string.

    The action is expected to look like ``name(key='value', ...)``. The
    parameters are converted into the keyword arguments used by the computer
    tool:

    * ``click`` / ``left_double`` / ``right_single``: ``x``, ``y`` from
      ``start_box``.
    * ``drag``: ``start_x``, ``start_y``, ``end_x``, ``end_y`` (only when both
      boxes are present).
    * ``scroll``: ``x``, ``y`` from ``start_box`` plus ``direction``.
    * ``type``: remapped to action ``type_text`` with the unescaped ``text``.
    * ``hotkey``: ``keys`` as the whitespace-separated parts of ``key``.

    Args:
        action: The action string to parse

    Returns:
        Tuple of (action_name, action_parameters). ``("finished", {})`` is
        returned for any action starting with ``finished``, and ``("", {})``
        when the string cannot be parsed. For any other action the parameters
        always contain at least the ``"action"`` key.
    """
    # Handle "finished" action
    if action.startswith("finished"):
        return "finished", {}

    # Parse action parameters
    action_match = re.match(r'(\w+)\((.*)\)', action)
    if not action_match:
        logger.warning(f"Could not parse action: {action}")
        return "", {}

    action_name = action_match.group(1)
    action_params_str = action_match.group(2)

    tool_args: Dict[str, Any] = {"action": action_name}

    # Dispatch on the action name first, so free-text payloads (typed content,
    # key names) that happen to contain "start_box" are not mistaken for
    # coordinate actions.
    if action_name == "type":
        # Prefer a pattern that steps over backslash escapes so an escaped
        # quote (\') does not end the content early; fall back to the plain
        # pattern for inputs such as a trailing backslash before the quote.
        content_match = re.search(
            r"content='((?:\\.|[^'\\])*)'", action_params_str
        ) or re.search(r"content='([^']*)'", action_params_str)
        if content_match:
            # Unescape escaped characters
            text = (
                content_match.group(1)
                .replace("\\'", "'")
                .replace('\\"', '"')
                .replace("\\n", "\n")
            )
            tool_args = {"action": "type_text", "text": text}

    elif action_name == "hotkey":
        # Extract key combination
        key_match = re.search(r"key='([^']*)'", action_params_str)
        if key_match:
            keys = key_match.group(1).split()
            tool_args = {"action": "hotkey", "keys": keys}

    elif "start_box" in action_params_str:
        # Extract all box coordinates, with or without the box tokens
        box_pattern = r"(start_box|end_box)='(?:<\|box_start\|>)?\((\d+),\s*(\d+)\)(?:<\|box_end\|>)?'"
        box_matches = re.findall(box_pattern, action_params_str)

        if action_name in ("click", "left_double", "right_single", "scroll"):
            # The target point is the first start_box
            for box_type, x, y in box_matches:
                if box_type == "start_box":
                    tool_args["x"] = int(x)
                    tool_args["y"] = int(y)
                    break

            if action_name == "scroll":
                direction_match = re.search(r"direction='([^']+)'", action_params_str)
                if direction_match:
                    tool_args["direction"] = direction_match.group(1)

        elif action_name == "drag":
            start_x, start_y = None, None
            end_x, end_y = None, None

            # When a box type repeats, the last occurrence wins
            for box_type, x, y in box_matches:
                if box_type == "start_box":
                    start_x, start_y = int(x), int(y)
                elif box_type == "end_box":
                    end_x, end_y = int(x), int(y)

            # Only emit drag coordinates when both endpoints were found
            if None not in (start_x, start_y, end_x, end_y):
                tool_args["start_x"] = start_x
                tool_args["start_y"] = start_y
                tool_args["end_x"] = end_x
                tool_args["end_y"] = end_y

    return action_name, tool_args
|
agent/ui/gradio/app.py
CHANGED
|
@@ -162,6 +162,10 @@ MODEL_MAPPINGS = {
|
|
|
162
162
|
"claude-3-5-sonnet-20240620": "claude-3-5-sonnet-20240620",
|
|
163
163
|
"claude-3-7-sonnet-20250219": "claude-3-7-sonnet-20250219",
|
|
164
164
|
},
|
|
165
|
+
"uitars": {
|
|
166
|
+
# UI-TARS models default to custom endpoint
|
|
167
|
+
"default": "ByteDance-Seed/UI-TARS-1.5-7B",
|
|
168
|
+
},
|
|
165
169
|
"ollama": {
|
|
166
170
|
# For Ollama models, we keep the original name
|
|
167
171
|
"default": "llama3", # A common default model
|
|
@@ -191,6 +195,7 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
|
|
|
191
195
|
"ANTHROPIC": AgentLoop.ANTHROPIC,
|
|
192
196
|
"OMNI": AgentLoop.OMNI,
|
|
193
197
|
"OMNI-OLLAMA": AgentLoop.OMNI, # Special case for Ollama models with OMNI parser
|
|
198
|
+
"UITARS": AgentLoop.UITARS, # UI-TARS implementation
|
|
194
199
|
}
|
|
195
200
|
agent_loop = loop_provider_map.get(loop_provider, AgentLoop.OPENAI)
|
|
196
201
|
|
|
@@ -281,7 +286,9 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
|
|
|
281
286
|
# Assign the determined model name
|
|
282
287
|
model_name_to_use = cleaned_model_name
|
|
283
288
|
# agent_loop remains AgentLoop.OMNI
|
|
284
|
-
|
|
289
|
+
elif agent_loop == AgentLoop.UITARS:
|
|
290
|
+
provider = LLMProvider.OAICOMPAT
|
|
291
|
+
model_name_to_use = MODEL_MAPPINGS["uitars"]["default"] # Default
|
|
285
292
|
else:
|
|
286
293
|
# Default to OpenAI if unrecognized loop
|
|
287
294
|
provider = LLMProvider.OPENAI
|
|
@@ -551,6 +558,7 @@ def create_gradio_ui(
|
|
|
551
558
|
"OPENAI": openai_models,
|
|
552
559
|
"ANTHROPIC": anthropic_models,
|
|
553
560
|
"OMNI": omni_models + ["Custom model..."], # Add custom model option
|
|
561
|
+
"UITARS": ["Custom model..."], # UI-TARS options
|
|
554
562
|
}
|
|
555
563
|
|
|
556
564
|
# --- Apply Saved Settings (override defaults if available) ---
|
|
@@ -692,7 +700,7 @@ def create_gradio_ui(
|
|
|
692
700
|
with gr.Accordion("Configuration", open=True):
|
|
693
701
|
# Configuration options
|
|
694
702
|
agent_loop = gr.Dropdown(
|
|
695
|
-
choices=["OPENAI", "ANTHROPIC", "OMNI"],
|
|
703
|
+
choices=["OPENAI", "ANTHROPIC", "OMNI", "UITARS"],
|
|
696
704
|
label="Agent Loop",
|
|
697
705
|
value=initial_loop,
|
|
698
706
|
info="Select the agent loop provider",
|
|
@@ -807,6 +815,8 @@ def create_gradio_ui(
|
|
|
807
815
|
provider, cleaned_model_name_from_func, agent_loop_type = (
|
|
808
816
|
get_provider_and_model(model_string_to_analyze, agent_loop_choice)
|
|
809
817
|
)
|
|
818
|
+
|
|
819
|
+
print(f"provider={provider} cleaned_model_name_from_func={cleaned_model_name_from_func} agent_loop_type={agent_loop_type} agent_loop_choice={agent_loop_choice}")
|
|
810
820
|
|
|
811
821
|
# Determine the final model name to send to the agent
|
|
812
822
|
# If custom selected, use the custom text box value, otherwise use the cleaned name
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: cua-agent
|
|
3
|
-
Version: 0.1.29
|
|
3
|
+
Version: 0.1.30
|
|
4
4
|
Summary: CUA (Computer Use) Agent for AI-driven computer interaction
|
|
5
5
|
Author-Email: TryCua <gh@trycua.com>
|
|
6
6
|
Requires-Python: >=3.10
|
|
@@ -21,6 +21,8 @@ Requires-Dist: boto3<2.0.0,>=1.35.81; extra == "anthropic"
|
|
|
21
21
|
Provides-Extra: openai
|
|
22
22
|
Requires-Dist: openai<2.0.0,>=1.14.0; extra == "openai"
|
|
23
23
|
Requires-Dist: httpx<0.29.0,>=0.27.0; extra == "openai"
|
|
24
|
+
Provides-Extra: uitars
|
|
25
|
+
Requires-Dist: httpx<0.29.0,>=0.27.0; extra == "uitars"
|
|
24
26
|
Provides-Extra: ui
|
|
25
27
|
Requires-Dist: gradio<6.0.0,>=5.23.3; extra == "ui"
|
|
26
28
|
Requires-Dist: python-dotenv<2.0.0,>=1.0.1; extra == "ui"
|
|
@@ -118,6 +120,9 @@ async with Computer() as macos_computer:
|
|
|
118
120
|
# or
|
|
119
121
|
# loop=AgentLoop.OMNI,
|
|
120
122
|
# model=LLM(provider=LLMProvider.OLLAMA, model="gemma3")
|
|
123
|
+
# or
|
|
124
|
+
# loop=AgentLoop.UITARS,
|
|
125
|
+
# model=LLM(provider=LLMProvider.OAICOMPAT, model="tgi", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1")
|
|
121
126
|
)
|
|
122
127
|
|
|
123
128
|
tasks = [
|
|
@@ -192,6 +197,10 @@ The Gradio UI provides:
|
|
|
192
197
|
- Configuration of agent parameters
|
|
193
198
|
- Chat interface for interacting with the agent
|
|
194
199
|
|
|
200
|
+
### Using UI-TARS
|
|
201
|
+
|
|
202
|
+
You can use UI-TARS by first following the [deployment guide](https://github.com/bytedance/UI-TARS/blob/main/README_deploy.md). This will give you a provider URL such as `https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1`, which you can then use in the Gradio UI.
|
|
203
|
+
|
|
195
204
|
## Agent Loops
|
|
196
205
|
|
|
197
206
|
The `cua-agent` package provides several agent loop variations, based on different CUA model providers and techniques:
|
|
@@ -200,6 +209,7 @@ The `cua-agent` package provides three agent loops variations, based on differen
|
|
|
200
209
|
|:-----------|:-----------------|:------------|:-------------|
|
|
201
210
|
| `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA model | Not Required |
|
|
202
211
|
| `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use | Not Required |
|
|
212
|
+
| `AgentLoop.UITARS` | • `ByteDance-Seed/UI-TARS-1.5-7B` | Uses ByteDance's UI-TARS 1.5 model | Not Required |
|
|
203
213
|
| `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `phi4`<br>• `phi4-mini`<br>• `gemma3`<br>• `...`<br>• `Any Ollama or OpenAI-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |
|
|
204
214
|
|
|
205
215
|
## AgentResponse
|
|
@@ -241,25 +251,9 @@ async for result in agent.run(task):
|
|
|
241
251
|
print(output)
|
|
242
252
|
```
|
|
243
253
|
|
|
244
|
-
### Gradio UI
|
|
245
|
-
|
|
246
|
-
You can also interact with the agent using a Gradio interface.
|
|
247
|
-
|
|
248
|
-
```python
|
|
249
|
-
# Ensure environment variables (e.g., API keys) are loaded
|
|
250
|
-
# You might need a helper function like load_dotenv_files() if using .env
|
|
251
|
-
# from utils import load_dotenv_files
|
|
252
|
-
# load_dotenv_files()
|
|
253
|
-
|
|
254
|
-
from agent.ui.gradio.app import create_gradio_ui
|
|
255
|
-
|
|
256
|
-
app = create_gradio_ui()
|
|
257
|
-
app.launch(share=False)
|
|
258
|
-
```
|
|
259
|
-
|
|
260
254
|
**Note on Settings Persistence:**
|
|
261
255
|
|
|
262
256
|
* The Gradio UI automatically saves your configuration (Agent Loop, Model Choice, Custom Base URL, Save Trajectory state, Recent Images count) to a file named `.gradio_settings.json` in the project's root directory when you successfully run a task.
|
|
263
257
|
* This allows your preferences to persist between sessions.
|
|
264
258
|
* API keys entered into the custom provider field are **not** saved in this file for security reasons. Manage API keys using environment variables (e.g., `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`) or a `.env` file.
|
|
265
|
-
* It's recommended to add `.gradio_settings.json` to your `.gitignore` file.
|
|
259
|
+
* It's recommended to add `.gradio_settings.json` to your `.gitignore` file.
|
|
@@ -4,7 +4,7 @@ agent/core/agent.py,sha256=HUfBe7Uam3TObAmf6KH0GDKuNCNunNmmMcuxS7aZg0Q,8332
|
|
|
4
4
|
agent/core/base.py,sha256=2sg8B2VqUKImRlkLTNj5lx-Oarlu7_GoMR6MbNzSY9Q,8078
|
|
5
5
|
agent/core/callbacks.py,sha256=FKAxyajJ-ZJ5SxNXoupNcrm0GYBgjOjJEsStqst0EAk,6453
|
|
6
6
|
agent/core/experiment.py,sha256=Ywj6q3JZFDKicfPuQsDl0vSN55HS7-Cnk3u3EcUCKe8,8866
|
|
7
|
-
agent/core/factory.py,sha256=
|
|
7
|
+
agent/core/factory.py,sha256=LSamFOjq2WhGGp5EVyLfDMAUrgHH1C_K5PKpdo24rhU,4573
|
|
8
8
|
agent/core/messages.py,sha256=-OVMDqcxK5MUHPEkHliK29XFJYMRAc1keFvzrUyrOmM,16231
|
|
9
9
|
agent/core/provider_config.py,sha256=Hr9kDFSXdPeqC6hbid3OTykNF0-XVi0wzZyd44a7kww,627
|
|
10
10
|
agent/core/telemetry.py,sha256=HElPd32k_w2SJ6t-Cc3j_2-AKdLbFwh2YlM8QViDgRw,4790
|
|
@@ -16,7 +16,7 @@ agent/core/tools/collection.py,sha256=NuwTn6dXSyznxWodfmFDQwUlxxaGb4oBPym4AEJABS
|
|
|
16
16
|
agent/core/tools/computer.py,sha256=lT_aW3huoYpcM8kffuokELupSz_WZG_qkaW1gITRC58,3892
|
|
17
17
|
agent/core/tools/edit.py,sha256=kv4jTKCM0VXrnoNErf7mT-xlr81-7T8v49_VA9y_L4Y,2005
|
|
18
18
|
agent/core/tools/manager.py,sha256=IRsCXjGc076nncQuyIjODoafnHTDhrf9sP5B4q5Pcdo,1742
|
|
19
|
-
agent/core/types.py,sha256=
|
|
19
|
+
agent/core/types.py,sha256=lDMtMFoBRW82X559VJBpbnNAzRo4LL7BbhT5r_QZFmg,2421
|
|
20
20
|
agent/core/visualization.py,sha256=1DuFF5sSeSf5BRSevBMDxml9-ajl7BQLFm5KBUwMbI8,6573
|
|
21
21
|
agent/providers/__init__.py,sha256=b4tIBAaIB1V7p8V0BWipHVnMhfHH_OuVgP4OWGSHdD8,194
|
|
22
22
|
agent/providers/anthropic/__init__.py,sha256=Mj11IZnVshZ2iHkvg4Z5-jrQIaD1WvzDz2Zk_pMwqIA,149
|
|
@@ -66,11 +66,20 @@ agent/providers/openai/tools/computer.py,sha256=jZUr-IOjlYoGOYNiXF6AYdTY4Wch86aS
|
|
|
66
66
|
agent/providers/openai/tools/manager.py,sha256=-wM641dLf8vcv6QF9x_ViGJeDl2YTuUV93j6u7GBI18,3903
|
|
67
67
|
agent/providers/openai/types.py,sha256=0mFUxeFy23fJhMwc6lAFVXKngg2fJIXkPS5oV284V1M,898
|
|
68
68
|
agent/providers/openai/utils.py,sha256=YeCZWIqOFSeugWoqAS0rhxOKAfL-9uN9nrYSBGBgPdc,3175
|
|
69
|
+
agent/providers/uitars/__init__.py,sha256=sq5OMVJP9E_sok9tIiKJreGkjmNWXPMObjPTClYv1es,38
|
|
70
|
+
agent/providers/uitars/clients/base.py,sha256=5w8Ajmq1JiPyUQJUAq1lSkfpA8_Ts80NQiDxPMTtQrI,948
|
|
71
|
+
agent/providers/uitars/clients/oaicompat.py,sha256=y3ieCZjNIdKUjSDYnP7SEJ5cCQzUJhv1rD8p_vpOWPw,8845
|
|
72
|
+
agent/providers/uitars/loop.py,sha256=l6OZKJmBmA1qJJbACqU0HrUtzrJoJma-0ida6WrlZZY,23500
|
|
73
|
+
agent/providers/uitars/prompts.py,sha256=XP8XE2KvDPxat8cDmIJuLHCq3iqO_7IOPWTKYB3WMHQ,2328
|
|
74
|
+
agent/providers/uitars/tools/__init__.py,sha256=0hc3W6u5TvcXYztYKIyve_C2G3XMfwt_y7grmH0ZHC0,29
|
|
75
|
+
agent/providers/uitars/tools/computer.py,sha256=WpbpZA9tFcr3zGBlO0CpwUhKmiOsuwh5zlVzu0Ormks,11641
|
|
76
|
+
agent/providers/uitars/tools/manager.py,sha256=2dK9STtz6NuZG3i0nH7ZuHJpb7vKJ2mOVbxGsb0t8lQ,1945
|
|
77
|
+
agent/providers/uitars/utils.py,sha256=y3B91_a5D9hWx4PQl5KNEoZ2G2jUAGZe4m8-m_iI9qw,5184
|
|
69
78
|
agent/telemetry.py,sha256=pVGxbj0ewnvq4EGj28CydN4a1iOfvZR_XKL3vIOqhOM,390
|
|
70
79
|
agent/ui/__init__.py,sha256=ohhxJLBin6k1hl5sKcmBST8mgh23WXgAXz3pN4f470E,45
|
|
71
80
|
agent/ui/gradio/__init__.py,sha256=ANKZhv1HqsLheWbLVBlyRQ7Q5qGeXuPi5jDs8vu-ZMo,579
|
|
72
|
-
agent/ui/gradio/app.py,sha256=
|
|
73
|
-
cua_agent-0.1.
|
|
74
|
-
cua_agent-0.1.
|
|
75
|
-
cua_agent-0.1.
|
|
76
|
-
cua_agent-0.1.
|
|
81
|
+
agent/ui/gradio/app.py,sha256=6dnGEF_YOrlEp8qcfMeQKcZvm3VAFzZFF-lsEpQF1as,41989
|
|
82
|
+
cua_agent-0.1.30.dist-info/METADATA,sha256=wBhcFokroLwf-0hGXcn9ZP-KcN-KT4ZAPCveTcKKzQ8,11179
|
|
83
|
+
cua_agent-0.1.30.dist-info/WHEEL,sha256=tSfRZzRHthuv7vxpI4aehrdN9scLjk-dCJkPLzkHxGg,90
|
|
84
|
+
cua_agent-0.1.30.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
|
|
85
|
+
cua_agent-0.1.30.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|