cua-agent 0.3.2__py3-none-any.whl → 0.4.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of cua-agent has been flagged as potentially problematic. See the registry's advisory page for details.
- agent/__init__.py +15 -51
- agent/__main__.py +21 -0
- agent/adapters/__init__.py +9 -0
- agent/adapters/huggingfacelocal_adapter.py +216 -0
- agent/agent.py +577 -0
- agent/callbacks/__init__.py +17 -0
- agent/callbacks/base.py +153 -0
- agent/callbacks/budget_manager.py +44 -0
- agent/callbacks/image_retention.py +139 -0
- agent/callbacks/logging.py +247 -0
- agent/callbacks/pii_anonymization.py +259 -0
- agent/callbacks/trajectory_saver.py +305 -0
- agent/cli.py +290 -0
- agent/computer_handler.py +107 -0
- agent/decorators.py +90 -0
- agent/loops/__init__.py +11 -0
- agent/loops/anthropic.py +728 -0
- agent/loops/omniparser.py +339 -0
- agent/loops/openai.py +95 -0
- agent/loops/uitars.py +688 -0
- agent/responses.py +207 -0
- agent/types.py +79 -0
- agent/ui/__init__.py +7 -1
- agent/ui/gradio/__init__.py +6 -19
- agent/ui/gradio/app.py +80 -1299
- agent/ui/gradio/ui_components.py +703 -0
- cua_agent-0.4.0b1.dist-info/METADATA +424 -0
- cua_agent-0.4.0b1.dist-info/RECORD +30 -0
- agent/core/__init__.py +0 -27
- agent/core/agent.py +0 -210
- agent/core/base.py +0 -217
- agent/core/callbacks.py +0 -200
- agent/core/experiment.py +0 -249
- agent/core/factory.py +0 -122
- agent/core/messages.py +0 -332
- agent/core/provider_config.py +0 -21
- agent/core/telemetry.py +0 -142
- agent/core/tools/__init__.py +0 -21
- agent/core/tools/base.py +0 -74
- agent/core/tools/bash.py +0 -52
- agent/core/tools/collection.py +0 -46
- agent/core/tools/computer.py +0 -113
- agent/core/tools/edit.py +0 -67
- agent/core/tools/manager.py +0 -56
- agent/core/tools.py +0 -32
- agent/core/types.py +0 -88
- agent/core/visualization.py +0 -197
- agent/providers/__init__.py +0 -4
- agent/providers/anthropic/__init__.py +0 -6
- agent/providers/anthropic/api/client.py +0 -360
- agent/providers/anthropic/api/logging.py +0 -150
- agent/providers/anthropic/api_handler.py +0 -140
- agent/providers/anthropic/callbacks/__init__.py +0 -5
- agent/providers/anthropic/callbacks/manager.py +0 -65
- agent/providers/anthropic/loop.py +0 -568
- agent/providers/anthropic/prompts.py +0 -23
- agent/providers/anthropic/response_handler.py +0 -226
- agent/providers/anthropic/tools/__init__.py +0 -33
- agent/providers/anthropic/tools/base.py +0 -88
- agent/providers/anthropic/tools/bash.py +0 -66
- agent/providers/anthropic/tools/collection.py +0 -34
- agent/providers/anthropic/tools/computer.py +0 -396
- agent/providers/anthropic/tools/edit.py +0 -326
- agent/providers/anthropic/tools/manager.py +0 -54
- agent/providers/anthropic/tools/run.py +0 -42
- agent/providers/anthropic/types.py +0 -16
- agent/providers/anthropic/utils.py +0 -381
- agent/providers/omni/__init__.py +0 -8
- agent/providers/omni/api_handler.py +0 -42
- agent/providers/omni/clients/anthropic.py +0 -103
- agent/providers/omni/clients/base.py +0 -35
- agent/providers/omni/clients/oaicompat.py +0 -195
- agent/providers/omni/clients/ollama.py +0 -122
- agent/providers/omni/clients/openai.py +0 -155
- agent/providers/omni/clients/utils.py +0 -25
- agent/providers/omni/image_utils.py +0 -34
- agent/providers/omni/loop.py +0 -990
- agent/providers/omni/parser.py +0 -307
- agent/providers/omni/prompts.py +0 -64
- agent/providers/omni/tools/__init__.py +0 -30
- agent/providers/omni/tools/base.py +0 -29
- agent/providers/omni/tools/bash.py +0 -74
- agent/providers/omni/tools/computer.py +0 -179
- agent/providers/omni/tools/manager.py +0 -61
- agent/providers/omni/utils.py +0 -236
- agent/providers/openai/__init__.py +0 -6
- agent/providers/openai/api_handler.py +0 -456
- agent/providers/openai/loop.py +0 -472
- agent/providers/openai/response_handler.py +0 -205
- agent/providers/openai/tools/__init__.py +0 -15
- agent/providers/openai/tools/base.py +0 -79
- agent/providers/openai/tools/computer.py +0 -326
- agent/providers/openai/tools/manager.py +0 -106
- agent/providers/openai/types.py +0 -36
- agent/providers/openai/utils.py +0 -98
- agent/providers/uitars/__init__.py +0 -1
- agent/providers/uitars/clients/base.py +0 -35
- agent/providers/uitars/clients/mlxvlm.py +0 -263
- agent/providers/uitars/clients/oaicompat.py +0 -214
- agent/providers/uitars/loop.py +0 -660
- agent/providers/uitars/prompts.py +0 -63
- agent/providers/uitars/tools/__init__.py +0 -1
- agent/providers/uitars/tools/computer.py +0 -283
- agent/providers/uitars/tools/manager.py +0 -60
- agent/providers/uitars/utils.py +0 -264
- agent/telemetry.py +0 -21
- agent/ui/__main__.py +0 -15
- cua_agent-0.3.2.dist-info/METADATA +0 -295
- cua_agent-0.3.2.dist-info/RECORD +0 -87
- {cua_agent-0.3.2.dist-info → cua_agent-0.4.0b1.dist-info}/WHEEL +0 -0
- {cua_agent-0.3.2.dist-info → cua_agent-0.4.0b1.dist-info}/entry_points.txt +0 -0
agent/loops/uitars.py
ADDED
@@ -0,0 +1,688 @@
"""
UITARS agent loop implementation using liteLLM for ByteDance-Seed/UI-TARS-1.5-7B
"""

import asyncio
from typing import cast  # the wheel imports cast from ctypes (a C-pointer cast, unused below); typing.cast was almost certainly intended
import json
import base64
import math
import re
import ast
from typing import Dict, List, Any, AsyncGenerator, Union, Optional
from io import BytesIO
from PIL import Image
import litellm
from litellm.types.utils import ModelResponse
from litellm.responses.litellm_completion_transformation.transformation import LiteLLMCompletionResponsesConfig
from litellm.responses.utils import Usage
from openai.types.responses.response_computer_tool_call_param import ActionType, ResponseComputerToolCallParam
from openai.types.responses.response_input_param import ComputerCallOutput
from openai.types.responses.response_output_message_param import ResponseOutputMessageParam
from openai.types.responses.response_reasoning_item_param import ResponseReasoningItemParam, Summary

from ..decorators import agent_loop
from ..types import Messages, AgentResponse, Tools
from ..responses import (
    make_reasoning_item,
    make_output_text_item,
    make_click_item,
    make_double_click_item,
    make_drag_item,
    make_keypress_item,
    make_scroll_item,
    make_type_item,
    make_wait_item,
    make_input_image_item
)

# Constants from reference code
IMAGE_FACTOR = 28
MIN_PIXELS = 100 * 28 * 28
MAX_PIXELS = 16384 * 28 * 28
MAX_RATIO = 200

FINISH_WORD = "finished"
WAIT_WORD = "wait"
ENV_FAIL_WORD = "error_env"
CALL_USER = "call_user"

# Action space prompt for UITARS
UITARS_ACTION_SPACE = """
click(start_box='<|box_start|>(x1,y1)<|box_end|>')
left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
hotkey(key='')
type(content='') #If you want to submit your input, use "\\n" at the end of `content`.
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
wait() #Sleep for 5s and take a screenshot to check for any changes.
finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
"""

UITARS_PROMPT_TEMPLATE = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.

## Output Format
```
Thought: ...
Action: ...
```

## Action Space
{action_space}

## Note
- Use {language} in `Thought` part.
- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.

## User Instruction
{instruction}
"""


def round_by_factor(number: float, factor: int) -> int:
    """Returns the closest integer to 'number' that is divisible by 'factor'."""
    return round(number / factor) * factor


def ceil_by_factor(number: float, factor: int) -> int:
    """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
    return math.ceil(number / factor) * factor


def floor_by_factor(number: float, factor: int) -> int:
    """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
    return math.floor(number / factor) * factor


def smart_resize(
    height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
) -> tuple[int, int]:
    """
    Rescales the image so that the following conditions are met:
    1. Both dimensions (height and width) are divisible by 'factor'.
    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
    3. The aspect ratio of the image is maintained as closely as possible.
    """
    if max(height, width) / min(height, width) > MAX_RATIO:
        raise ValueError(
            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
        )
    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))
    if h_bar * w_bar > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = floor_by_factor(height / beta, factor)
        w_bar = floor_by_factor(width / beta, factor)
    elif h_bar * w_bar < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * beta, factor)
        w_bar = ceil_by_factor(width * beta, factor)
    return h_bar, w_bar
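
A quick check of the resizing contract (a sketch, assuming the module is importable as agent.loops.uitars; not part of the diff): a 1920x1080 screenshot is already inside the pixel budget, so both sides are simply rounded to the nearest multiple of 28, which presumably matches the vision encoder's patch grid.

    from agent.loops.uitars import smart_resize

    h, w = smart_resize(height=1080, width=1920)
    assert (h, w) == (1092, 1932)                     # 39*28 and 69*28
    assert 100 * 28 * 28 <= h * w <= 16384 * 28 * 28  # within [MIN_PIXELS, MAX_PIXELS]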


def escape_single_quotes(text):
    """Escape single quotes in text for safe string formatting."""
    pattern = r"(?<!\\)'"
    return re.sub(pattern, r"\\'", text)


def parse_action(action_str):
    """Parse action string into structured format."""
    try:
        node = ast.parse(action_str, mode='eval')
        if not isinstance(node, ast.Expression):
            raise ValueError("Not an expression")

        call = node.body
        if not isinstance(call, ast.Call):
            raise ValueError("Not a function call")

        # Get function name
        if isinstance(call.func, ast.Name):
            func_name = call.func.id
        elif isinstance(call.func, ast.Attribute):
            func_name = call.func.attr
        else:
            func_name = None

        # Get keyword arguments
        kwargs = {}
        for kw in call.keywords:
            key = kw.arg
            if isinstance(kw.value, ast.Constant):
                value = kw.value.value
            elif isinstance(kw.value, ast.Str):  # Compatibility with older Python
                value = kw.value.s
            else:
                value = None
            kwargs[key] = value

        return {
            'function': func_name,
            'args': kwargs
        }

    except Exception as e:
        print(f"Failed to parse action '{action_str}': {e}")
        return None
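
Because parse_action leans on ast.parse, anything that is a syntactically valid Python call with constant keyword arguments round-trips; anything else is caught, logged via print, and reported as None. For example (illustrative sketch):

    parse_action("click(start_box='(123,456)')")
    # -> {'function': 'click', 'args': {'start_box': '(123,456)'}}
    parse_action("hotkey(key='ctrl c')")
    # -> {'function': 'hotkey', 'args': {'key': 'ctrl c'}}
    parse_action("no parentheses")
    # -> None (the SyntaxError is caught and printed)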


def parse_uitars_response(text: str, image_width: int, image_height: int) -> List[Dict[str, Any]]:
    """Parse UITARS model response into structured actions."""
    text = text.strip()

    # Extract thought
    thought = None
    if text.startswith("Thought:"):
        thought_match = re.search(r"Thought: (.+?)(?=\s*Action:|$)", text, re.DOTALL)
        if thought_match:
            thought = thought_match.group(1).strip()

    # Extract action
    if "Action:" not in text:
        raise ValueError("No Action found in response")

    action_str = text.split("Action:")[-1].strip()

    # Handle special case for type actions
    if "type(content" in action_str:
        def escape_quotes(match):
            return match.group(1)

        pattern = r"type\(content='(.*?)'\)"
        content = re.sub(pattern, escape_quotes, action_str)
        action_str = escape_single_quotes(content)
        action_str = "type(content='" + action_str + "')"

    # Parse the action
    parsed_action = parse_action(action_str.replace("\n", "\\n").lstrip())
    if parsed_action is None:
        raise ValueError(f"Action can't parse: {action_str}")

    action_type = parsed_action["function"]
    params = parsed_action["args"]

    # Process parameters
    action_inputs = {}
    for param_name, param in params.items():
        if param == "":
            continue
        param = str(param).lstrip()
        action_inputs[param_name.strip()] = param

        # Handle coordinate parameters
        if "start_box" in param_name or "end_box" in param_name:
            # Parse coordinates like '(x,y)' or '(x1,y1,x2,y2)'
            numbers = param.replace("(", "").replace(")", "").split(",")
            float_numbers = [float(num.strip()) / 1000 for num in numbers]  # Normalize to 0-1 range

            if len(float_numbers) == 2:
                # Single point, duplicate for box format
                float_numbers = [float_numbers[0], float_numbers[1], float_numbers[0], float_numbers[1]]

            action_inputs[param_name.strip()] = str(float_numbers)

    return [{
        "thought": thought,
        "action_type": action_type,
        "action_inputs": action_inputs,
        "text": text
    }]
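
UI-TARS emits coordinates in a 0-1000 reference frame; this parser divides by 1000 to get screen fractions and duplicates a bare point into box form, leaving the pixel mapping to convert_to_computer_actions below (the image_width/image_height arguments are in fact unused here). A worked example (sketch):

    reply = "Thought: Open the settings icon.\nAction: click(start_box='(500,250)')"
    parse_uitars_response(reply, image_width=1920, image_height=1080)
    # -> [{'thought': 'Open the settings icon.',
    #      'action_type': 'click',
    #      'action_inputs': {'start_box': '[0.5, 0.25, 0.5, 0.25]'},
    #      'text': <the full reply>}]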


def convert_to_computer_actions(parsed_responses: List[Dict[str, Any]], image_width: int, image_height: int) -> List[ResponseComputerToolCallParam | ResponseOutputMessageParam]:
    """Convert parsed UITARS responses to computer actions."""
    computer_actions = []

    for response in parsed_responses:
        action_type = response.get("action_type")
        action_inputs = response.get("action_inputs", {})

        if action_type == "finished":
            finished_text = action_inputs.get("content", "Task completed successfully.")
            computer_actions.append(make_output_text_item(finished_text))
            break

        elif action_type == "wait":
            computer_actions.append(make_wait_item())

        elif action_type == "call_user":
            computer_actions.append(make_output_text_item("I need assistance from the user to proceed with this task."))

        elif action_type in ["click", "left_single"]:
            start_box = action_inputs.get("start_box")
            if start_box:
                coords = ast.literal_eval(start_box)  # literal_eval instead of the wheel's eval(): same result for these "[x1, y1, x2, y2]" strings, without executing model-derived code
                x = int((coords[0] + coords[2]) / 2 * image_width)
                y = int((coords[1] + coords[3]) / 2 * image_height)

                computer_actions.append(make_click_item(x, y, "left"))

        elif action_type in ["double_click", "left_double"]:  # the action space above emits left_double; the wheel only matched double_click
            start_box = action_inputs.get("start_box")
            if start_box:
                coords = ast.literal_eval(start_box)
                x = int((coords[0] + coords[2]) / 2 * image_width)
                y = int((coords[1] + coords[3]) / 2 * image_height)

                computer_actions.append(make_double_click_item(x, y))

        elif action_type in ["right_click", "right_single"]:  # likewise, the action space emits right_single
            start_box = action_inputs.get("start_box")
            if start_box:
                coords = ast.literal_eval(start_box)
                x = int((coords[0] + coords[2]) / 2 * image_width)
                y = int((coords[1] + coords[3]) / 2 * image_height)

                computer_actions.append(make_click_item(x, y, "right"))

        elif action_type == "type":
            content = action_inputs.get("content", "")
            computer_actions.append(make_type_item(content))

        elif action_type == "hotkey":
            key = action_inputs.get("key", "")
            keys = key.split()
            computer_actions.append(make_keypress_item(keys))

        elif action_type == "press":
            key = action_inputs.get("key", "")
            computer_actions.append(make_keypress_item([key]))

        elif action_type == "scroll":
            start_box = action_inputs.get("start_box")
            direction = action_inputs.get("direction", "down")

            if start_box:
                coords = ast.literal_eval(start_box)
                x = int((coords[0] + coords[2]) / 2 * image_width)
                y = int((coords[1] + coords[3]) / 2 * image_height)
            else:
                x, y = image_width // 2, image_height // 2

            scroll_y = 5 if "up" in direction.lower() else -5
            computer_actions.append(make_scroll_item(x, y, 0, scroll_y))

        elif action_type == "drag":
            start_box = action_inputs.get("start_box")
            end_box = action_inputs.get("end_box")

            if start_box and end_box:
                start_coords = ast.literal_eval(start_box)
                end_coords = ast.literal_eval(end_box)

                start_x = int((start_coords[0] + start_coords[2]) / 2 * image_width)
                start_y = int((start_coords[1] + start_coords[3]) / 2 * image_height)
                end_x = int((end_coords[0] + end_coords[2]) / 2 * image_width)
                end_y = int((end_coords[1] + end_coords[3]) / 2 * image_height)

                path = [{"x": start_x, "y": start_y}, {"x": end_x, "y": end_y}]
                computer_actions.append(make_drag_item(path))

    return computer_actions
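
Pixel mapping happens here: each normalized box is collapsed to its center and scaled by the real screenshot size. Continuing the example above (sketch; the exact item dicts come from the make_* helpers in agent/responses.py):

    parsed = parse_uitars_response(reply, 1920, 1080)
    actions = convert_to_computer_actions(parsed, image_width=1920, image_height=1080)
    # -> one left-click item at x=960, y=270
    #    x = int((0.5 + 0.5) / 2 * 1920), y = int((0.25 + 0.25) / 2 * 1080)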


def pil_to_base64(image: Image.Image) -> str:
    """Convert PIL image to base64 string."""
    buffer = BytesIO()
    image.save(buffer, format="PNG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")


def process_image_for_uitars(image_data: str, max_pixels: int = MAX_PIXELS, min_pixels: int = MIN_PIXELS) -> tuple[Image.Image, int, int]:
    """Process image for UITARS model input."""
    # Decode base64 image
    if image_data.startswith('data:image'):
        image_data = image_data.split(',')[1]

    image_bytes = base64.b64decode(image_data)
    image = Image.open(BytesIO(image_bytes))

    original_width, original_height = image.size

    # Resize image according to UITARS requirements
    if image.width * image.height > max_pixels:
        resize_factor = math.sqrt(max_pixels / (image.width * image.height))
        width = int(image.width * resize_factor)
        height = int(image.height * resize_factor)
        image = image.resize((width, height))

    if image.width * image.height < min_pixels:
        resize_factor = math.sqrt(min_pixels / (image.width * image.height))
        width = math.ceil(image.width * resize_factor)
        height = math.ceil(image.height * resize_factor)
        image = image.resize((width, height))

    if image.mode != "RGB":
        image = image.convert("RGB")

    return image, original_width, original_height
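
Note that this path only enforces the min/max pixel budget; unlike smart_resize (which is defined above but never called in this file), it does not snap dimensions to multiples of IMAGE_FACTOR, and it reports the original size back so the coordinate math stays in the source frame. A round-trip check (sketch):

    from PIL import Image

    img = Image.new("RGB", (1920, 1080))
    processed, ow, oh = process_image_for_uitars("data:image/png;base64," + pil_to_base64(img))
    assert (ow, oh) == (1920, 1080)                        # original size preserved for callers
    assert processed.width * processed.height <= MAX_PIXELS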


def sanitize_message(msg: Any) -> Any:
    """Return a copy of the message with image_url omitted from content parts."""
    if isinstance(msg, dict):
        result = {}
        for key, value in msg.items():
            if key == "content" and isinstance(value, list):
                result[key] = [
                    {k: v for k, v in item.items() if k != "image_url"} if isinstance(item, dict) else item
                    for item in value
                ]
            else:
                result[key] = value
        return result
    elif isinstance(msg, list):
        return [sanitize_message(item) for item in msg]
    else:
        return msg


def convert_uitars_messages_to_litellm(messages: Messages) -> List[Dict[str, Any]]:
    """
    Convert UITARS internal message format back to LiteLLM format.

    This function processes reasoning, computer_call, and computer_call_output messages
    and converts them to the appropriate LiteLLM assistant message format.

    Args:
        messages: List of UITARS internal messages

    Returns:
        List of LiteLLM formatted messages
    """
    litellm_messages = []
    current_assistant_content = []

    for message in messages:
        if isinstance(message, dict):
            message_type = message.get("type")

            if message_type == "reasoning":
                # Extract reasoning text from summary
                summary = message.get("summary", [])
                if summary and isinstance(summary, list):
                    for summary_item in summary:
                        if isinstance(summary_item, dict) and summary_item.get("type") == "summary_text":
                            reasoning_text = summary_item.get("text", "")
                            if reasoning_text:
                                current_assistant_content.append(f"Thought: {reasoning_text}")

            elif message_type == "computer_call":
                # Convert computer action to UITARS action format
                action = message.get("action", {})
                action_type = action.get("type")

                if action_type == "click":
                    x, y = action.get("x", 0), action.get("y", 0)
                    button = action.get("button", "left")
                    if button == "left":
                        action_text = f"Action: click(start_box='({x},{y})')"
                    elif button == "right":
                        action_text = f"Action: right_single(start_box='({x},{y})')"
                    else:
                        action_text = f"Action: click(start_box='({x},{y})')"

                elif action_type == "double_click":
                    x, y = action.get("x", 0), action.get("y", 0)
                    action_text = f"Action: left_double(start_box='({x},{y})')"

                elif action_type == "drag":
                    start_x, start_y = action.get("start_x", 0), action.get("start_y", 0)
                    end_x, end_y = action.get("end_x", 0), action.get("end_y", 0)
                    action_text = f"Action: drag(start_box='({start_x},{start_y})', end_box='({end_x},{end_y})')"

                elif action_type == "key":
                    key = action.get("key", "")
                    action_text = f"Action: hotkey(key='{key}')"

                elif action_type == "type":
                    text = action.get("text", "")
                    # Escape single quotes in the text
                    escaped_text = escape_single_quotes(text)
                    action_text = f"Action: type(content='{escaped_text}')"

                elif action_type == "scroll":
                    x, y = action.get("x", 0), action.get("y", 0)
                    direction = action.get("direction", "down")
                    action_text = f"Action: scroll(start_box='({x},{y})', direction='{direction}')"

                elif action_type == "wait":
                    action_text = "Action: wait()"

                else:
                    # Fallback for unknown action types
                    action_text = f"Action: {action_type}({action})"

                current_assistant_content.append(action_text)

                # When we hit a computer_call_output, finalize the current assistant message
                if current_assistant_content:
                    litellm_messages.append({
                        "role": "assistant",
                        "content": [{"type": "text", "text": "\n".join(current_assistant_content)}]
                    })
                    current_assistant_content = []

            elif message_type == "computer_call_output":
                # Add screenshot from computer call output
                output = message.get("output", {})
                if isinstance(output, dict) and output.get("type") == "input_image":
                    image_url = output.get("image_url", "")
                    if image_url:
                        litellm_messages.append({
                            "role": "user",
                            "content": [{"type": "image_url", "image_url": {"url": image_url}}]
                        })

            elif message.get("role") == "user":
                # # Handle user messages
                # content = message.get("content", "")
                # if isinstance(content, str):
                #     litellm_messages.append({
                #         "role": "user",
                #         "content": content
                #     })
                # elif isinstance(content, list):
                #     litellm_messages.append({
                #         "role": "user",
                #         "content": content
                #     })
                pass

    # Add any remaining assistant content
    if current_assistant_content:
        litellm_messages.append({
            "role": "assistant",
            "content": [{"type": "text", "text": "\n".join(current_assistant_content)}]  # wrapped to match the flush above; the wheel appends the raw list of strings here
        })

    return litellm_messages
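
The net effect is that each reasoning + computer_call pair is replayed to the model as one assistant "Thought/Action" turn, and each computer_call_output screenshot becomes a user image turn. For instance (sketch; note the replayed start_box carries the pixel coordinates stored on the action item, not the model's native 0-1000 frame):

    history = [
        {"type": "reasoning", "summary": [{"type": "summary_text", "text": "Open the browser."}]},
        {"type": "computer_call", "action": {"type": "click", "x": 100, "y": 200, "button": "left"}},
        {"type": "computer_call_output", "output": {"type": "input_image", "image_url": "data:image/png;base64,..."}},
    ]
    convert_uitars_messages_to_litellm(history)
    # -> [{'role': 'assistant', 'content': [{'type': 'text',
    #        'text': "Thought: Open the browser.\nAction: click(start_box='(100,200)')"}]},
    #     {'role': 'user', 'content': [{'type': 'image_url',
    #        'image_url': {'url': 'data:image/png;base64,...'}}]}]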


@agent_loop(models=r"(?i).*ui-?tars.*", priority=10)
async def uitars_loop(
    messages: Messages,
    model: str,
    tools: Optional[List[Dict[str, Any]]] = None,
    max_retries: Optional[int] = None,
    stream: bool = False,
    computer_handler=None,
    use_prompt_caching: Optional[bool] = False,
    _on_api_start=None,
    _on_api_end=None,
    _on_usage=None,
    _on_screenshot=None,
    **kwargs
) -> Union[AgentResponse, AsyncGenerator[Dict[str, Any], None]]:
    """
    UITARS agent loop using liteLLM for ByteDance-Seed/UI-TARS-1.5-7B model.

    Supports UITARS vision-language models for computer control.
    """
    tools = tools or []

    # Create response items
    response_items = []

    # Find computer tool for screen dimensions
    computer_tool = None
    for tool_schema in tools:
        if tool_schema["type"] == "computer":
            computer_tool = tool_schema["computer"]
            break

    # Get screen dimensions
    screen_width, screen_height = 1024, 768
    if computer_tool:
        try:
            screen_width, screen_height = await computer_tool.get_dimensions()
        except Exception:  # the wheel uses a bare except; keep the default dimensions on any failure
            pass

    # Process messages to extract instruction and image
    instruction = ""
    image_data = None

    # Convert messages to list if string
    if isinstance(messages, str):
        messages = [{"role": "user", "content": messages}]

    # Extract instruction and latest screenshot
    for message in reversed(messages):
        if isinstance(message, dict):
            content = message.get("content", "")

            # Handle different content formats
            if isinstance(content, str):
                if not instruction and message.get("role") == "user":
                    instruction = content
            elif isinstance(content, list):
                for item in content:
                    if isinstance(item, dict):
                        if item.get("type") == "text" and not instruction:
                            instruction = item.get("text", "")
                        elif item.get("type") == "image_url" and not image_data:
                            image_url = item.get("image_url", {})
                            if isinstance(image_url, dict):
                                image_data = image_url.get("url", "")
                            else:
                                image_data = image_url

            # Also check for computer_call_output with screenshots
            if message.get("type") == "computer_call_output" and not image_data:
                output = message.get("output", {})
                if isinstance(output, dict) and output.get("type") == "input_image":
                    image_data = output.get("image_url", "")

        if instruction and image_data:
            break

    if not instruction:
        instruction = "Help me complete this task by analyzing the screen and taking appropriate actions."

    # Create prompt
    user_prompt = UITARS_PROMPT_TEMPLATE.format(
        instruction=instruction,
        action_space=UITARS_ACTION_SPACE,
        language="English"
    )

    # Convert conversation history to LiteLLM format
    history_messages = convert_uitars_messages_to_litellm(messages)

    # Prepare messages for liteLLM
    litellm_messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant."
        }
    ]

    # Add current user instruction with screenshot
    current_user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": user_prompt},
        ]
    }
    litellm_messages.append(current_user_message)

    # Process image for UITARS
    if not image_data:
        # Take screenshot if none found in messages
        if computer_handler:
            image_data = await computer_handler.screenshot()
            if _on_screenshot:  # guard added; the hook is optional and the wheel awaits it unconditionally
                await _on_screenshot(image_data, "screenshot_before")

            # Add screenshot to output items so it can be retained in history
            response_items.append(make_input_image_item(image_data))
        else:
            raise ValueError("No screenshot found in messages and no computer_handler provided")
    processed_image, original_width, original_height = process_image_for_uitars(image_data)
    encoded_image = pil_to_base64(processed_image)

    # Add conversation history
    if history_messages:
        litellm_messages.extend(history_messages)
    else:
        litellm_messages.append({
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_image}"}}
            ]
        })

    # Prepare API call kwargs
    api_kwargs = {
        "model": model,
        "messages": litellm_messages,
        "max_tokens": kwargs.get("max_tokens", 500),
        "temperature": kwargs.get("temperature", 0.0),
        "do_sample": kwargs.get("temperature", 0.0) > 0.0,
        "num_retries": max_retries,
        **{k: v for k, v in kwargs.items() if k not in ["max_tokens", "temperature"]}
    }

    # Call API start hook
    if _on_api_start:
        await _on_api_start(api_kwargs)

    # Call liteLLM with UITARS model
    response = await litellm.acompletion(**api_kwargs)

    # Call API end hook
    if _on_api_end:
        await _on_api_end(api_kwargs, response)

    # Extract response content
    response_content = response.choices[0].message.content.strip()  # type: ignore

    # Parse UITARS response
    parsed_responses = parse_uitars_response(response_content, original_width, original_height)

    # Convert to computer actions
    computer_actions = convert_to_computer_actions(parsed_responses, original_width, original_height)

    # Add computer actions to response items
    thought = parsed_responses[0].get("thought", "")
    if thought:
        response_items.append(make_reasoning_item(thought))
    response_items.extend(computer_actions)

    # Extract usage information
    response_usage = {
        **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
        "response_cost": response._hidden_params.get("response_cost", 0.0),
    }
    if _on_usage:
        await _on_usage(response_usage)

    # Create agent response
    agent_response = {
        "output": response_items,
        "usage": response_usage
    }

    return agent_response
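
The decorator registers this loop for any model string matching ui-tars/uitars (case-insensitively), but it can also be driven directly. A minimal invocation (sketch: the model route and my_computer are assumptions; any LiteLLM-routable UI-TARS endpoint and any handler exposing an async screenshot() should fit):

    import asyncio

    async def main():
        result = await uitars_loop(
            messages="Close the update dialog",                        # a bare string becomes a user message
            model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",   # assumed route
            computer_handler=my_computer,                              # hypothetical handler; supplies the screenshot
        )
        for item in result["output"]:                                  # reasoning item, then computer_call items
            print(item)

    asyncio.run(main())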