cua-agent 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/__init__.py +21 -12
- agent/__main__.py +21 -0
- agent/adapters/__init__.py +9 -0
- agent/adapters/huggingfacelocal_adapter.py +229 -0
- agent/agent.py +594 -0
- agent/callbacks/__init__.py +19 -0
- agent/callbacks/base.py +153 -0
- agent/callbacks/budget_manager.py +44 -0
- agent/callbacks/image_retention.py +139 -0
- agent/callbacks/logging.py +247 -0
- agent/callbacks/pii_anonymization.py +259 -0
- agent/callbacks/telemetry.py +210 -0
- agent/callbacks/trajectory_saver.py +305 -0
- agent/cli.py +297 -0
- agent/computer_handler.py +107 -0
- agent/decorators.py +90 -0
- agent/loops/__init__.py +11 -0
- agent/loops/anthropic.py +728 -0
- agent/loops/omniparser.py +339 -0
- agent/loops/openai.py +95 -0
- agent/loops/uitars.py +688 -0
- agent/responses.py +207 -0
- agent/telemetry.py +135 -14
- agent/types.py +79 -0
- agent/ui/__init__.py +7 -1
- agent/ui/__main__.py +2 -13
- agent/ui/gradio/__init__.py +6 -19
- agent/ui/gradio/app.py +94 -1313
- agent/ui/gradio/ui_components.py +721 -0
- cua_agent-0.4.0.dist-info/METADATA +424 -0
- cua_agent-0.4.0.dist-info/RECORD +33 -0
- {cua_agent-0.3.1.dist-info → cua_agent-0.4.0.dist-info}/WHEEL +1 -1
- agent/core/__init__.py +0 -27
- agent/core/agent.py +0 -210
- agent/core/base.py +0 -217
- agent/core/callbacks.py +0 -200
- agent/core/experiment.py +0 -249
- agent/core/factory.py +0 -122
- agent/core/messages.py +0 -332
- agent/core/provider_config.py +0 -21
- agent/core/telemetry.py +0 -142
- agent/core/tools/__init__.py +0 -21
- agent/core/tools/base.py +0 -74
- agent/core/tools/bash.py +0 -52
- agent/core/tools/collection.py +0 -46
- agent/core/tools/computer.py +0 -113
- agent/core/tools/edit.py +0 -67
- agent/core/tools/manager.py +0 -56
- agent/core/tools.py +0 -32
- agent/core/types.py +0 -88
- agent/core/visualization.py +0 -197
- agent/providers/__init__.py +0 -4
- agent/providers/anthropic/__init__.py +0 -6
- agent/providers/anthropic/api/client.py +0 -360
- agent/providers/anthropic/api/logging.py +0 -150
- agent/providers/anthropic/api_handler.py +0 -140
- agent/providers/anthropic/callbacks/__init__.py +0 -5
- agent/providers/anthropic/callbacks/manager.py +0 -65
- agent/providers/anthropic/loop.py +0 -568
- agent/providers/anthropic/prompts.py +0 -23
- agent/providers/anthropic/response_handler.py +0 -226
- agent/providers/anthropic/tools/__init__.py +0 -33
- agent/providers/anthropic/tools/base.py +0 -88
- agent/providers/anthropic/tools/bash.py +0 -66
- agent/providers/anthropic/tools/collection.py +0 -34
- agent/providers/anthropic/tools/computer.py +0 -396
- agent/providers/anthropic/tools/edit.py +0 -326
- agent/providers/anthropic/tools/manager.py +0 -54
- agent/providers/anthropic/tools/run.py +0 -42
- agent/providers/anthropic/types.py +0 -16
- agent/providers/anthropic/utils.py +0 -367
- agent/providers/omni/__init__.py +0 -8
- agent/providers/omni/api_handler.py +0 -42
- agent/providers/omni/clients/anthropic.py +0 -103
- agent/providers/omni/clients/base.py +0 -35
- agent/providers/omni/clients/oaicompat.py +0 -195
- agent/providers/omni/clients/ollama.py +0 -122
- agent/providers/omni/clients/openai.py +0 -155
- agent/providers/omni/clients/utils.py +0 -25
- agent/providers/omni/image_utils.py +0 -34
- agent/providers/omni/loop.py +0 -990
- agent/providers/omni/parser.py +0 -307
- agent/providers/omni/prompts.py +0 -64
- agent/providers/omni/tools/__init__.py +0 -30
- agent/providers/omni/tools/base.py +0 -29
- agent/providers/omni/tools/bash.py +0 -74
- agent/providers/omni/tools/computer.py +0 -179
- agent/providers/omni/tools/manager.py +0 -61
- agent/providers/omni/utils.py +0 -236
- agent/providers/openai/__init__.py +0 -6
- agent/providers/openai/api_handler.py +0 -456
- agent/providers/openai/loop.py +0 -472
- agent/providers/openai/response_handler.py +0 -205
- agent/providers/openai/tools/__init__.py +0 -15
- agent/providers/openai/tools/base.py +0 -79
- agent/providers/openai/tools/computer.py +0 -326
- agent/providers/openai/tools/manager.py +0 -106
- agent/providers/openai/types.py +0 -36
- agent/providers/openai/utils.py +0 -98
- agent/providers/uitars/__init__.py +0 -1
- agent/providers/uitars/clients/base.py +0 -35
- agent/providers/uitars/clients/mlxvlm.py +0 -263
- agent/providers/uitars/clients/oaicompat.py +0 -214
- agent/providers/uitars/loop.py +0 -660
- agent/providers/uitars/prompts.py +0 -63
- agent/providers/uitars/tools/__init__.py +0 -1
- agent/providers/uitars/tools/computer.py +0 -283
- agent/providers/uitars/tools/manager.py +0 -60
- agent/providers/uitars/utils.py +0 -264
- cua_agent-0.3.1.dist-info/METADATA +0 -295
- cua_agent-0.3.1.dist-info/RECORD +0 -87
- {cua_agent-0.3.1.dist-info → cua_agent-0.4.0.dist-info}/entry_points.txt +0 -0
|
@@ -1,63 +0,0 @@
|
|
|
1
|
-
"""Prompts for UI-TARS agent."""
|
|
2
|
-
|
|
3
|
-
# Extra hint text for macOS hosts telling the model to prefer 'cmd' over 'ctrl'
# in hotkey() actions. Where it gets attached to a prompt is not visible in
# this module.
MAC_SPECIFIC_NOTES = """
(You are operating on macOS, use 'cmd' instead of 'ctrl' for most shortcuts e.g., hotkey(key='cmd c') for copy, hotkey(key='cmd v') for paste, hotkey(key='cmd t') for new tab).)
"""

# Generic system prompt string.
SYSTEM_PROMPT = "You are a helpful assistant."

# Desktop prompt template. It contains two placeholders, {language} and
# {instruction} (presumably filled with str.format by the caller -- confirm).
# The "Action Space" section lists the action syntax the model is asked to emit.
# Backslashes are doubled inside this non-raw string so the model sees literal
# escape sequences such as \' and \n in the instructions.
COMPUTER_USE = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.

## Output Format
```
Thought: ...
Action: ...
```

## Action Space

click(start_box='<|box_start|>(x1,y1)<|box_end|>')
left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
hotkey(key='')
type(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
wait() #Sleep for 5s and take a screenshot to check for any changes.
finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.


## Note
- Use {language} in `Thought` part.
- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.

## User Instruction
{instruction}
"""

# Mobile prompt template. Same {language}/{instruction} placeholders as
# COMPUTER_USE, but the action space lists mobile actions (long_press,
# open_app, press_home, press_back) instead of the desktop-only ones.
MOBILE_USE = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
## Output Format
```
Thought: ...
Action: ...
```
## Action Space

click(start_box='<|box_start|>(x1,y1)<|box_end|>')
long_press(start_box='<|box_start|>(x1,y1)<|box_end|>')
type(content='') #If you want to submit your input, use "\\n" at the end of `content`.
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
open_app(app_name=\'\')
drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
press_home()
press_back()
finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.


## Note
- Use {language} in `Thought` part.
- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.

## User Instruction
{instruction}
"""
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
"""UI-TARS tools package."""
|
|
@@ -1,283 +0,0 @@
|
|
|
1
|
-
"""Computer tool for UI-TARS."""
|
|
2
|
-
|
|
3
|
-
import asyncio
|
|
4
|
-
import base64
|
|
5
|
-
import logging
|
|
6
|
-
import re
|
|
7
|
-
from typing import Any, Dict, List, Optional, Literal, Union
|
|
8
|
-
|
|
9
|
-
from computer import Computer
|
|
10
|
-
from ....core.tools.base import ToolResult, ToolFailure
|
|
11
|
-
from ....core.tools.computer import BaseComputerTool
|
|
12
|
-
|
|
13
|
-
logger = logging.getLogger(__name__)
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
class ComputerTool(BaseComputerTool):
    """
    A tool that allows the UI-TARS agent to interact with the screen, keyboard, and mouse.

    Each supported action is forwarded to ``self.computer.interface``. Every
    action except ``finished`` returns a ``ToolResult`` that carries a fresh
    base64-encoded screenshot taken after a short settle delay.
    """

    name: str = "computer"
    width: Optional[int] = None
    height: Optional[int] = None
    computer: Computer

    def __init__(self, computer: Computer):
        """Initialize the computer tool.

        Args:
            computer: Computer instance
        """
        super().__init__(computer)
        self.computer = computer
        # Dimensions stay unset until initialize_dimensions() has run.
        self.width = None
        self.height = None
        self.logger = logging.getLogger(__name__)

    def to_params(self) -> Dict[str, Any]:
        """Convert tool to API parameters.

        Returns:
            Dictionary with the tool type and the display width/height.

        Raises:
            RuntimeError: If the screen dimensions have not been initialized.
        """
        if self.width is None or self.height is None:
            raise RuntimeError(
                "Screen dimensions not initialized. Call initialize_dimensions() first."
            )
        return {
            "type": "computer",
            "display_width": self.width,
            "display_height": self.height,
        }

    async def initialize_dimensions(self) -> None:
        """Initialize screen dimensions from the computer interface.

        Falls back to 1024x768 (with a warning) when the interface query fails,
        so this method always leaves ``width`` and ``height`` set.
        """
        try:
            display_size = await self.computer.interface.get_screen_size()
            self.width = display_size["width"]
            self.height = display_size["height"]
            self.logger.info(f"Initialized screen dimensions to {self.width}x{self.height}")
        except Exception as e:
            # Fall back to defaults if we can't get accurate dimensions
            self.width = 1024
            self.height = 768
            self.logger.warning(
                f"Failed to get screen dimensions, using defaults: {self.width}x{self.height}. Error: {e}"
            )

    async def _result_with_screenshot(self, output: str, settle_seconds: float) -> ToolResult:
        """Wait for the UI to settle, take a screenshot and wrap it in a ToolResult.

        Args:
            output: Human-readable description of the action that was performed.
            settle_seconds: How long to sleep before capturing the screen.

        Returns:
            ToolResult with ``output`` and the screenshot encoded as base64 text.
        """
        await asyncio.sleep(settle_seconds)
        screenshot = await self.computer.interface.screenshot()
        return ToolResult(
            output=output,
            base64_image=base64.b64encode(screenshot).decode("utf-8"),
        )

    async def __call__(
        self,
        *,
        action: str,
        **kwargs,
    ) -> ToolResult:
        """Execute a computer action.

        Args:
            action: The action to perform (based on UI-TARS action space)
            **kwargs: Additional parameters for the action

        Returns:
            ToolResult containing action output and possibly a base64 image.
            A ToolFailure is returned (not raised) for missing parameters,
            unsupported actions and any exception raised while executing.
        """
        try:
            # Ensure dimensions are initialized
            if self.width is None or self.height is None:
                await self.initialize_dimensions()
                if self.width is None or self.height is None:
                    return ToolFailure(error="Failed to initialize screen dimensions")

            # The three click flavours differ only in the interface method
            # that is called and in the verb used in the result message.
            click_actions = {
                "click": ("left_click", "Clicked"),
                "left_double": ("double_click", "Double-clicked"),
                "right_single": ("right_click", "Right-clicked"),
            }
            if action in click_actions:
                if "x" not in kwargs or "y" not in kwargs:
                    return ToolFailure(error=f"Missing coordinates for {action} action")
                method_name, verb = click_actions[action]
                x, y = kwargs["x"], kwargs["y"]
                await getattr(self.computer.interface, method_name)(x, y)
                return await self._result_with_screenshot(f"{verb} at ({x}, {y})", 0.5)

            # Handle typing text
            if action == "type_text":
                if "text" not in kwargs:
                    return ToolFailure(error="Missing text for type action")
                text = kwargs["text"]
                await self.computer.interface.type_text(text)
                return await self._result_with_screenshot(f"Typed: {text}", 0.3)

            # Handle hotkey
            if action == "hotkey":
                keys = kwargs.get("keys")
                # An empty key list is treated like a missing one instead of
                # surfacing as an IndexError from keys[0] below.
                if not keys:
                    return ToolFailure(error="Missing keys for hotkey action")
                if len(keys) > 1:
                    await self.computer.interface.hotkey(*keys)
                else:
                    # Single key press
                    await self.computer.interface.press_key(keys[0])
                return await self._result_with_screenshot(
                    f"Pressed hotkey: {', '.join(keys)}", 0.3
                )

            # Handle drag action
            if action == "drag":
                if not all(k in kwargs for k in ("start_x", "start_y", "end_x", "end_y")):
                    return ToolFailure(error="Missing coordinates for drag action")
                start_x, start_y = kwargs["start_x"], kwargs["start_y"]
                end_x, end_y = kwargs["end_x"], kwargs["end_y"]
                await self.computer.interface.move_cursor(start_x, start_y)
                await self.computer.interface.drag_to(end_x, end_y)
                return await self._result_with_screenshot(
                    f"Dragged from ({start_x}, {start_y}) to ({end_x}, {end_y})", 0.5
                )

            # Handle scroll action
            if action == "scroll":
                if not all(k in kwargs for k in ("x", "y", "direction")):
                    return ToolFailure(error="Missing parameters for scroll action")
                x, y = kwargs["x"], kwargs["y"]
                direction = kwargs["direction"]

                # Move cursor to position
                await self.computer.interface.move_cursor(x, y)

                if direction == "down":
                    await self.computer.interface.scroll_down(5)
                elif direction == "up":
                    await self.computer.interface.scroll_up(5)
                elif direction in ("right", "left"):
                    # Horizontal scrolling is not wired up; keep the
                    # best-effort result but make the no-op visible in logs.
                    self.logger.warning(
                        f"Horizontal scroll ({direction}) is not implemented; no scroll performed"
                    )
                else:
                    return ToolFailure(error=f"Invalid scroll direction: {direction}")

                return await self._result_with_screenshot(
                    f"Scrolled {direction} at ({x}, {y})", 0.5
                )

            # Handle wait action: sleep for 5 seconds as specified in the action space
            if action == "wait":
                return await self._result_with_screenshot("Waited for 5 seconds", 5)

            # Handle finished action (task completion); no screenshot is taken
            if action == "finished":
                content = kwargs.get("content", "Task completed")
                return ToolResult(
                    output=f"Task finished: {content}",
                )

            return ToolFailure(error=f"Unsupported action: {action}")

        except Exception as e:
            self.logger.error(f"Error in ComputerTool.__call__: {str(e)}")
            return ToolFailure(error=f"Failed to execute {action}: {str(e)}")
|
|
@@ -1,60 +0,0 @@
|
|
|
1
|
-
"""Tool manager for the UI-TARS provider."""
|
|
2
|
-
|
|
3
|
-
import logging
|
|
4
|
-
from typing import Any, Dict, List, Optional
|
|
5
|
-
|
|
6
|
-
from computer import Computer
|
|
7
|
-
from ....core.tools import BaseToolManager
|
|
8
|
-
from ....core.tools.collection import ToolCollection
|
|
9
|
-
from .computer import ComputerTool
|
|
10
|
-
|
|
11
|
-
logger = logging.getLogger(__name__)
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
class ToolManager(BaseToolManager):
    """Owns the UI-TARS ``ComputerTool`` and exposes it through the base manager API."""

    def __init__(self, computer: Computer):
        """Create the manager and its single computer tool.

        Args:
            computer: Computer instance handed to the base manager; the tool is
                built from ``self.computer`` afterwards.
        """
        super().__init__(computer)
        # The only UI-TARS-specific tool is the computer tool.
        self.computer_tool = ComputerTool(self.computer)
        self._initialized = False

    def _initialize_tools(self) -> ToolCollection:
        """Return a collection holding the computer tool."""
        return ToolCollection(self.computer_tool)

    async def _initialize_tools_specific(self) -> None:
        """Run provider-specific setup: fetch the screen dimensions for the tool."""
        await self.computer_tool.initialize_dimensions()

    def get_tool_params(self) -> List[Dict[str, Any]]:
        """Return the tool parameters to send with API calls.

        Returns:
            Whatever ``self.tools.to_params()`` produces.

        Raises:
            RuntimeError: If ``self.tools`` is still ``None``.
        """
        tools = self.tools
        if tools is not None:
            return tools.to_params()
        raise RuntimeError("Tools not initialized. Call initialize() first.")

    async def execute_tool(self, name: str, tool_input: dict[str, Any]) -> Any:
        """Run the named tool with the given input.

        Args:
            name: Name of the tool to execute
            tool_input: Input parameters for the tool

        Returns:
            The awaited result of ``self.tools.run(...)``.

        Raises:
            RuntimeError: If ``self.tools`` is still ``None``.
        """
        tools = self.tools
        if tools is not None:
            return await tools.run(name=name, tool_input=tool_input)
        raise RuntimeError("Tools not initialized. Call initialize() first.")
|
agent/providers/uitars/utils.py
DELETED
|
@@ -1,264 +0,0 @@
|
|
|
1
|
-
"""Utility functions for the UI-TARS provider."""
|
|
2
|
-
|
|
3
|
-
import logging
|
|
4
|
-
import base64
|
|
5
|
-
import re
|
|
6
|
-
from typing import Any, Dict, List, Optional, Union, Tuple
|
|
7
|
-
from datetime import datetime
|
|
8
|
-
|
|
9
|
-
logger = logging.getLogger(__name__)
|
|
10
|
-
|
|
11
|
-
from ...core.types import AgentResponse
|
|
12
|
-
|
|
13
|
-
async def to_agent_response_format(
    response: Dict[str, Any],
    messages: List[Dict[str, Any]],
    model: Optional[str] = None,
) -> AgentResponse:
    """Convert raw UI-TARS response to agent response format.

    The message text is split into an optional ``Thought:`` part, which
    becomes a ``reasoning`` output item, and the parsed actions. A
    ``finished`` action becomes an assistant ``message`` item; every other
    action becomes a ``computer_call`` item.

    Args:
        response: Raw UI-TARS response; the text is read from
            ``response["choices"][0]["message"]["content"]``.
        messages: List of messages in standard format (not used here; kept
            for interface compatibility).
        model: Optional model name; falls back to ``response["model"]``.

    Returns:
        AgentResponse: Standardized agent response format
    """
    # Create unique IDs for this response
    response_id = f"resp_{datetime.now().strftime('%Y%m%d%H%M%S')}_{id(response)}"
    reasoning_id = f"rs_{response_id}"
    action_id = f"cu_{response_id}"

    # Parse actions from the raw response. A missing/None content is treated
    # as empty text so parsing yields no actions instead of raising.
    content = response["choices"][0]["message"]["content"] or ""
    actions = parse_actions(content)

    # Extract thought content if available
    reasoning_text = ""
    thought_match = re.search(r"Thought: (.*?)(?=\s*Action:|$)", content, re.DOTALL)
    if thought_match:
        reasoning_text = thought_match.group(1).strip()

    # Create output items
    output_items = []
    if reasoning_text:
        output_items.append({
            "type": "reasoning",
            "id": reasoning_id,
            "text": reasoning_text
        })

    for i, action in enumerate(actions):
        action_name, tool_args = parse_action_parameters(action)
        if action_name == "finished":
            output_items.append({
                "type": "message",
                "role": "assistant",
                "content": [{
                    "type": "output_text",
                    # finished() without content parses to {}, so do not
                    # index the key directly.
                    "text": tool_args.get("content", "")
                }],
                "id": f"action_{i}_{action_id}",
                "status": "completed"
            })
        else:
            # Drop the redundant "action" entry when it only repeats the name;
            # it is kept when the parser remapped it (e.g. type -> type_text).
            if tool_args.get("action") == action_name:
                del tool_args["action"]
            output_items.append({
                "type": "computer_call",
                # Use the parsed name, not the raw action text, in the id.
                "id": f"{action_name}_{i}_{action_id}",
                "call_id": f"call_{i}_{action_id}",
                "action": { "type": action_name, **tool_args },
                "pending_safety_checks": [],
                "status": "completed"
            })

    # Create agent response
    agent_response = AgentResponse(
        id=response_id,
        object="response",
        created_at=int(datetime.now().timestamp()),
        status="completed",
        error=None,
        incomplete_details=None,
        instructions=None,
        max_output_tokens=None,
        model=model or response["model"],
        output=output_items,
        parallel_tool_calls=True,
        previous_response_id=None,
        reasoning={"effort": "medium"},
        store=True,
        temperature=0.0,
        top_p=0.7,
        text={"format": {"type": "text"}},
        tool_choice="auto",
        tools=[
            {
                "type": "computer_use_preview",
                "display_height": 768,
                "display_width": 1024,
                "environment": "mac",
            }
        ],
        truncation="auto",
        usage=response.get("usage", {}),
        user=None,
        metadata={},
        response=response
    )
    return agent_response
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
def add_box_token(input_string: str) -> str:
    """Add box tokens to the coordinates in the model response.

    Every ``start_box='(x,y)'`` / ``end_box='(x,y)'`` argument found after the
    first ``"Action: "`` marker is rewritten to
    ``...='<|box_start|>(x,y)<|box_end|>'``. Whitespace after the comma is
    accepted and normalized away. Coordinates that are already wrapped are
    left untouched.

    Args:
        input_string: Raw model response

    Returns:
        String with box tokens added. The input is returned unchanged when it
        contains no ``"Action: "`` marker or no ``start_box=`` argument.
        Otherwise the text before the first marker is kept, followed by
        ``"Action: "`` and the stripped action chunks joined by blank lines.
    """
    if "Action: " not in input_string or "start_box=" not in input_string:
        return input_string

    # Everything before the first marker is preserved verbatim.
    prefix, *action_chunks = input_string.split("Action: ")

    # Substitute on the regex match itself so "(10, 20)" and "(10,20)" are
    # both handled; rebuilding a literal needle would miss the spaced form.
    box_coord = re.compile(r"(start_box|end_box)='\((\d+),\s*(\d+)\)'")
    wrapped = r"\1='<|box_start|>(\2,\3)<|box_end|>'"
    processed_actions = [box_coord.sub(wrapped, chunk.strip()) for chunk in action_chunks]

    return prefix + "Action: " + "\n\n".join(processed_actions)
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
def parse_actions(response: str) -> List[str]:
    """Extract the action strings from a UI-TARS model response.

    Only the text after the last ``Action:`` marker is considered. That text
    is split on blank lines, and each non-empty piece is returned stripped.

    Args:
        response: The raw model response text

    Returns:
        List of action strings; empty when there is no ``Action:`` marker or
        nothing follows it.
    """
    if "Action:" not in response:
        return []

    # Text following the final marker.
    tail = response.split("Action:")[-1].strip()

    # Blank lines separate multiple actions; whitespace-only pieces are dropped.
    return [chunk.strip() for chunk in tail.split("\n\n") if chunk.strip()]
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
def parse_action_parameters(action: str) -> Tuple[str, Dict[str, Any]]:
    """Parse parameters from an action string.

    Supported shapes:
      * ``finished(content='...')`` -> ``("finished", {"content": ...})``, or
        ``("finished", {})`` when no content is present.
      * ``click`` / ``left_double`` / ``right_single`` -> ``x``/``y`` from the
        first ``start_box``.
      * ``drag`` -> ``start_x``/``start_y``/``end_x``/``end_y``, only when both
        boxes are present.
      * ``scroll`` -> ``x``/``y`` from the first ``start_box`` plus ``direction``.
      * ``type`` -> ``{"action": "type_text", "text": ...}``.
      * ``hotkey`` -> ``{"action": "hotkey", "keys": [...]}``, where the key
        string is split on whitespace.
    Any other ``name(...)`` call yields ``{"action": name}``.

    Quoted ``content`` values may contain backslash-escaped characters
    (``\\'``, ``\\"``, ``\\n``); they are unescaped in the returned text.

    Args:
        action: The action string to parse

    Returns:
        Tuple of (action_name, action_parameters); ``("", {})`` when the
        string does not look like ``name(...)``.
    """
    # Single-quoted value that may contain backslash-escaped characters, so an
    # escaped quote (\') does not terminate the match early.
    quoted_value = r"'((?:\\.|[^'\\])*)'"

    def _unescape(raw: str) -> str:
        """Undo the escaping the prompt asks the model to apply."""
        return raw.replace("\\'", "'").replace('\\"', '"').replace("\\n", "\n")

    # Handle "finished" action
    if action.startswith("finished"):
        content_match = re.search("content=" + quoted_value, action, re.DOTALL)
        if content_match:
            return "finished", {"content": _unescape(content_match.group(1))}
        return "finished", {}

    # Parse action parameters
    action_match = re.match(r'(\w+)\((.*)\)', action)
    if not action_match:
        logger.warning(f"Could not parse action: {action}")
        return "", {}

    action_name, action_params_str = action_match.groups()
    tool_args: Dict[str, Any] = {"action": action_name}

    # All box coordinates, with or without the <|box_start|>/<|box_end|> tokens.
    box_pattern = r"(start_box|end_box)='(?:<\|box_start\|>)?\((\d+),\s*(\d+)\)(?:<\|box_end\|>)?'"
    box_matches = re.findall(box_pattern, action_params_str)

    # Dispatch on the action name, so text arguments that merely contain the
    # substring "start_box" are still parsed as type/hotkey.
    if action_name in ("click", "left_double", "right_single", "scroll"):
        # Point actions use the first start_box.
        for box_type, x, y in box_matches:
            if box_type == "start_box":
                tool_args["x"] = int(x)
                tool_args["y"] = int(y)
                break

        if action_name == "scroll":
            direction_match = re.search(r"direction='([^']+)'", action_params_str)
            if direction_match:
                tool_args["direction"] = direction_match.group(1)

    elif action_name == "drag":
        # Later occurrences of the same box type override earlier ones.
        endpoints = {box_type: (int(x), int(y)) for box_type, x, y in box_matches}
        if "start_box" in endpoints and "end_box" in endpoints:
            tool_args["start_x"], tool_args["start_y"] = endpoints["start_box"]
            tool_args["end_x"], tool_args["end_y"] = endpoints["end_box"]

    elif action_name == "type":
        content_match = re.search("content=" + quoted_value, action_params_str, re.DOTALL)
        if content_match:
            # The executed action is named "type_text", not "type".
            tool_args = {"action": "type_text", "text": _unescape(content_match.group(1))}

    elif action_name == "hotkey":
        key_match = re.search(r"key='([^']*)'", action_params_str)
        if key_match:
            tool_args = {"action": "hotkey", "keys": key_match.group(1).split()}

    return action_name, tool_args
|