cua-agent 0.4.34__py3-none-any.whl → 0.4.35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/huggingfacelocal_adapter.py +54 -61
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +14 -6
- agent/adapters/models/generic.py +7 -4
- agent/adapters/models/internvl.py +66 -30
- agent/adapters/models/opencua.py +23 -8
- agent/adapters/models/qwen2_5_vl.py +7 -4
- agent/agent.py +184 -158
- agent/callbacks/__init__.py +4 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +18 -13
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +3 -1
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/telemetry.py +67 -61
- agent/callbacks/trajectory_saver.py +90 -70
- agent/cli.py +115 -110
- agent/computers/__init__.py +13 -8
- agent/computers/base.py +26 -17
- agent/computers/cua.py +27 -23
- agent/computers/custom.py +72 -69
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +235 -185
- agent/integrations/hud/__init__.py +15 -21
- agent/integrations/hud/agent.py +101 -83
- agent/integrations/hud/proxy.py +90 -57
- agent/loops/__init__.py +25 -21
- agent/loops/anthropic.py +537 -483
- agent/loops/base.py +13 -14
- agent/loops/composed_grounded.py +135 -149
- agent/loops/gemini.py +31 -12
- agent/loops/glm45v.py +135 -133
- agent/loops/gta1.py +47 -50
- agent/loops/holo.py +4 -2
- agent/loops/internvl.py +6 -11
- agent/loops/moondream3.py +36 -12
- agent/loops/omniparser.py +212 -209
- agent/loops/openai.py +49 -50
- agent/loops/opencua.py +29 -41
- agent/loops/qwen.py +475 -0
- agent/loops/uitars.py +237 -202
- agent/proxy/examples.py +54 -50
- agent/proxy/handlers.py +27 -34
- agent/responses.py +330 -330
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +23 -18
- agent/ui/gradio/ui_components.py +310 -161
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/METADATA +18 -10
- cua_agent-0.4.35.dist-info/RECORD +64 -0
- cua_agent-0.4.34.dist-info/RECORD +0 -63
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/entry_points.txt +0 -0
agent/loops/uitars.py
CHANGED
@@ -4,39 +4,50 @@ Paper: https://arxiv.org/abs/2501.12326
 Code: https://github.com/bytedance/UI-TARS
 """

+import ast
 import asyncio
-from ctypes import cast
-import json
 import base64
+import json
 import math
 import re
-import
-from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
+from ctypes import cast
 from io import BytesIO
-from
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+
 import litellm
-from litellm.
-
+from litellm.responses.litellm_completion_transformation.transformation import (
+    LiteLLMCompletionResponsesConfig,
+)
 from litellm.responses.utils import Usage
-from
+from litellm.types.utils import ModelResponse
+from openai.types.responses.response_computer_tool_call_param import (
+    ActionType,
+    ResponseComputerToolCallParam,
+)
 from openai.types.responses.response_input_param import ComputerCallOutput
-from openai.types.responses.response_output_message_param import
-
+from openai.types.responses.response_output_message_param import (
+    ResponseOutputMessageParam,
+)
+from openai.types.responses.response_reasoning_item_param import (
+    ResponseReasoningItemParam,
+    Summary,
+)
+from PIL import Image

 from ..decorators import register_agent
-from ..types import Messages, AgentResponse, Tools, AgentCapability
 from ..responses import (
-    make_reasoning_item,
-    make_output_text_item,
     make_click_item,
     make_double_click_item,
     make_drag_item,
+    make_input_image_item,
     make_keypress_item,
+    make_output_text_item,
+    make_reasoning_item,
     make_scroll_item,
     make_type_item,
     make_wait_item,
-    make_input_image_item
 )
+from ..types import AgentCapability, AgentResponse, Messages, Tools

 # Constants from reference code
 IMAGE_FACTOR = 28
@@ -94,6 +105,7 @@ click(point='<|box_start|>(x1,y1)<|box_end|>')
 ## User Instruction
 {instruction}"""

+
 def round_by_factor(number: float, factor: int) -> int:
     """Returns the closest integer to 'number' that is divisible by 'factor'."""
     return round(number / factor) * factor
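As a quick sanity check of the rounding helper shown above, here is a hypothetical call (the input value is illustrative, not taken from the package):

    # round_by_factor snaps a value to the nearest multiple of `factor`
    # (IMAGE_FACTOR = 28 in this module).
    round_by_factor(100, 28)  # 100 / 28 is about 3.57, rounds to 4, so 4 * 28 = 112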
@@ -110,7 +122,11 @@ def floor_by_factor(number: float, factor: int) -> int:


 def smart_resize(
-    height: int,
+    height: int,
+    width: int,
+    factor: int = IMAGE_FACTOR,
+    min_pixels: int = MIN_PIXELS,
+    max_pixels: int = MAX_PIXELS,
 ) -> tuple[int, int]:
     """
     Rescales the image so that the following conditions are met:
@@ -144,14 +160,14 @@ def escape_single_quotes(text):
 def parse_action(action_str):
     """Parse action string into structured format."""
     try:
-        node = ast.parse(action_str, mode=
+        node = ast.parse(action_str, mode="eval")
         if not isinstance(node, ast.Expression):
             raise ValueError("Not an expression")
-
+
         call = node.body
         if not isinstance(call, ast.Call):
             raise ValueError("Not a function call")
-
+
         # Get function name
         if isinstance(call.func, ast.Name):
             func_name = call.func.id
@@ -159,7 +175,7 @@ def parse_action(action_str):
             func_name = call.func.attr
         else:
             func_name = None
-
+
         # Get keyword arguments
         kwargs = {}
         for kw in call.keywords:
@@ -171,12 +187,9 @@ def parse_action(action_str):
             else:
                 value = None
             kwargs[key] = value
-
-        return {
-
-            'args': kwargs
-        }
-
+
+        return {"function": func_name, "args": kwargs}
+
     except Exception as e:
         print(f"Failed to parse action '{action_str}': {e}")
         return None
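The reformatted `parse_action` still parses the action string as a Python call expression and returns a dict with the function name and keyword arguments. A hypothetical call, for orientation (the action string below is invented):

    result = parse_action("click(start_box='<|box_start|>(512,384)<|box_end|>')")
    # result == {
    #     "function": "click",
    #     "args": {"start_box": "<|box_start|>(512,384)<|box_end|>"},
    # }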
@@ -185,39 +198,39 @@ def parse_action(action_str):
 def parse_uitars_response(text: str, image_width: int, image_height: int) -> List[Dict[str, Any]]:
     """Parse UITARS model response into structured actions."""
     text = text.strip()
-
+
     # Extract thought
     thought = None
     if text.startswith("Thought:"):
         thought_match = re.search(r"Thought: (.+?)(?=\s*Action:|$)", text, re.DOTALL)
         if thought_match:
             thought = thought_match.group(1).strip()
-
+
     # Extract action
     if "Action:" not in text:
         raise ValueError("No Action found in response")
-
+
     action_str = text.split("Action:")[-1].strip()

     # Handle special case for type actions
     if "type(content" in action_str:
+
         def escape_quotes(match):
             return match.group(1)
-
+
         pattern = r"type\(content='(.*?)'\)"
         content = re.sub(pattern, escape_quotes, action_str)
         action_str = escape_single_quotes(content)
         action_str = "type(content='" + action_str + "')"
-
-
+
     # Parse the action
     parsed_action = parse_action(action_str.replace("\n", "\\n").lstrip())
     if parsed_action is None:
         raise ValueError(f"Action can't parse: {action_str}")
-
+
     action_type = parsed_action["function"]
     params = parsed_action["args"]
-
+
     # Process parameters
     action_inputs = {}
     for param_name, param in params.items():
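For context, `parse_uitars_response` expects the UI-TARS "Thought: ... Action: ..." envelope; a made-up example of the text it splits apart:

    text = "Thought: The Save button is in the toolbar.\nAction: click(start_box='(412,230)')"
    # The regex above captures "The Save button is in the toolbar." as the thought,
    # and text.split("Action:")[-1].strip() yields "click(start_box='(412,230)')".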
@@ -225,7 +238,7 @@ def parse_uitars_response(text: str, image_width: int, image_height: int) -> Lis
             continue
         param = str(param).lstrip()
         action_inputs[param_name.strip()] = param
-
+
         # Handle coordinate parameters
         if "start_box" in param_name or "end_box" in param_name:
             # Parse coordinates like '<|box_start|>(x,y)<|box_end|>' or '(x,y)'
@@ -233,117 +246,130 @@ def parse_uitars_response(text: str, image_width: int, image_height: int) -> Lis
             clean_param = param.replace("<|box_start|>", "").replace("<|box_end|>", "")
             # Then remove parentheses and split
             numbers = clean_param.replace("(", "").replace(")", "").split(",")
-
+
             try:
-                float_numbers = [
-
+                float_numbers = [
+                    float(num.strip()) / 1000 for num in numbers
+                ]  # Normalize to 0-1 range
+
                 if len(float_numbers) == 2:
                     # Single point, duplicate for box format
-                    float_numbers = [
-
+                    float_numbers = [
+                        float_numbers[0],
+                        float_numbers[1],
+                        float_numbers[0],
+                        float_numbers[1],
+                    ]
+
                 action_inputs[param_name.strip()] = str(float_numbers)
             except ValueError as e:
                 # If parsing fails, keep the original parameter value
                 print(f"Warning: Could not parse coordinates '{param}': {e}")
                 action_inputs[param_name.strip()] = param
-
-    return [
-
-
-
-
-
+
+    return [
+        {
+            "thought": thought,
+            "action_type": action_type,
+            "action_inputs": action_inputs,
+            "text": text,
+        }
+    ]


-def convert_to_computer_actions(
+def convert_to_computer_actions(
+    parsed_responses: List[Dict[str, Any]], image_width: int, image_height: int
+) -> List[ResponseComputerToolCallParam | ResponseOutputMessageParam]:
     """Convert parsed UITARS responses to computer actions."""
     computer_actions = []
-
+
     for response in parsed_responses:
         action_type = response.get("action_type")
         action_inputs = response.get("action_inputs", {})
-
+
         if action_type == "finished":
             finished_text = action_inputs.get("content", "Task completed successfully.")
             computer_actions.append(make_output_text_item(finished_text))
             break
-
+
         elif action_type == "wait":
             computer_actions.append(make_wait_item())
-
+
         elif action_type == "call_user":
-            computer_actions.append(
-
+            computer_actions.append(
+                make_output_text_item("I need assistance from the user to proceed with this task.")
+            )
+
         elif action_type in ["click", "left_single"]:
             start_box = action_inputs.get("start_box")
             if start_box:
                 coords = eval(start_box)
                 x = int((coords[0] + coords[2]) / 2 * image_width)
                 y = int((coords[1] + coords[3]) / 2 * image_height)
-
+
                 computer_actions.append(make_click_item(x, y, "left"))
-
+
         elif action_type == "double_click":
             start_box = action_inputs.get("start_box")
             if start_box:
                 coords = eval(start_box)
                 x = int((coords[0] + coords[2]) / 2 * image_width)
                 y = int((coords[1] + coords[3]) / 2 * image_height)
-
+
                 computer_actions.append(make_double_click_item(x, y))
-
+
         elif action_type == "right_click":
             start_box = action_inputs.get("start_box")
             if start_box:
                 coords = eval(start_box)
                 x = int((coords[0] + coords[2]) / 2 * image_width)
                 y = int((coords[1] + coords[3]) / 2 * image_height)
-
+
                 computer_actions.append(make_click_item(x, y, "right"))
-
+
         elif action_type == "type":
             content = action_inputs.get("content", "")
             computer_actions.append(make_type_item(content))
-
+
         elif action_type == "hotkey":
             key = action_inputs.get("key", "")
             keys = key.split()
             computer_actions.append(make_keypress_item(keys))
-
+
         elif action_type == "press":
             key = action_inputs.get("key", "")
             computer_actions.append(make_keypress_item([key]))
-
+
         elif action_type == "scroll":
             start_box = action_inputs.get("start_box")
             direction = action_inputs.get("direction", "down")
-
+
             if start_box:
                 coords = eval(start_box)
                 x = int((coords[0] + coords[2]) / 2 * image_width)
                 y = int((coords[1] + coords[3]) / 2 * image_height)
             else:
                 x, y = image_width // 2, image_height // 2
-
+
             scroll_y = 5 if "up" in direction.lower() else -5
             computer_actions.append(make_scroll_item(x, y, 0, scroll_y))
-
+
         elif action_type == "drag":
             start_box = action_inputs.get("start_box")
             end_box = action_inputs.get("end_box")
-
+
             if start_box and end_box:
                 start_coords = eval(start_box)
                 end_coords = eval(end_box)
-
+
                 start_x = int((start_coords[0] + start_coords[2]) / 2 * image_width)
                 start_y = int((start_coords[1] + start_coords[3]) / 2 * image_height)
                 end_x = int((end_coords[0] + end_coords[2]) / 2 * image_width)
                 end_y = int((end_coords[1] + end_coords[3]) / 2 * image_height)
-
+
                 path = [{"x": start_x, "y": start_y}, {"x": end_x, "y": end_y}]
                 computer_actions.append(make_drag_item(path))
-
+
     return computer_actions

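A short worked example of the coordinate handling above (numbers are illustrative): the model emits coordinates on a 0-1000 grid, `parse_uitars_response` divides them by 1000, and `convert_to_computer_actions` maps the box centre back to pixels.

    # Model output: Action: click(start_box='<|box_start|>(500,250)<|box_end|>')
    coords = [0.5, 0.25, 0.5, 0.25]          # after the /1000 normalization
    image_width, image_height = 1920, 1080   # hypothetical screenshot size
    x = int((coords[0] + coords[2]) / 2 * image_width)   # 960
    y = int((coords[1] + coords[3]) / 2 * image_height)  # 270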
@@ -354,33 +380,35 @@ def pil_to_base64(image: Image.Image) -> str:
     return base64.b64encode(buffer.getvalue()).decode("utf-8")


-def process_image_for_uitars(
+def process_image_for_uitars(
+    image_data: str, max_pixels: int = MAX_PIXELS, min_pixels: int = MIN_PIXELS
+) -> tuple[Image.Image, int, int]:
     """Process image for UITARS model input."""
     # Decode base64 image
-    if image_data.startswith(
-        image_data = image_data.split(
-
+    if image_data.startswith("data:image"):
+        image_data = image_data.split(",")[1]
+
     image_bytes = base64.b64decode(image_data)
     image = Image.open(BytesIO(image_bytes))
-
+
     original_width, original_height = image.size
-
+
     # Resize image according to UITARS requirements
     if image.width * image.height > max_pixels:
         resize_factor = math.sqrt(max_pixels / (image.width * image.height))
         width = int(image.width * resize_factor)
         height = int(image.height * resize_factor)
         image = image.resize((width, height))
-
+
     if image.width * image.height < min_pixels:
         resize_factor = math.sqrt(min_pixels / (image.width * image.height))
         width = math.ceil(image.width * resize_factor)
         height = math.ceil(image.height * resize_factor)
         image = image.resize((width, height))
-
+
     if image.mode != "RGB":
         image = image.convert("RGB")
-
+
     return image, original_width, original_height

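Rough numeric sketch of the downscaling branch in `process_image_for_uitars` (the pixel budget below is a stand-in for illustration only, not the module's MAX_PIXELS constant):

    import math

    width, height = 2560, 1440   # hypothetical screenshot
    max_pixels = 1024 * 1024     # assumed budget, for illustration
    if width * height > max_pixels:
        factor = math.sqrt(max_pixels / (width * height))
        width, height = int(width * factor), int(height * factor)
    # width * height now fits the budget (roughly 1365 x 768 here)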
@@ -391,7 +419,11 @@ def sanitize_message(msg: Any) -> Any:
         for key, value in msg.items():
             if key == "content" and isinstance(value, list):
                 result[key] = [
-
+                    (
+                        {k: v for k, v in item.items() if k != "image_url"}
+                        if isinstance(item, dict)
+                        else item
+                    )
                     for item in value
                 ]
             else:
@@ -406,38 +438,41 @@ def sanitize_message(msg: Any) -> Any:
 def convert_uitars_messages_to_litellm(messages: Messages) -> List[Dict[str, Any]]:
     """
     Convert UITARS internal message format back to LiteLLM format.
-
+
     This function processes reasoning, computer_call, and computer_call_output messages
     and converts them to the appropriate LiteLLM assistant message format.
-
+
     Args:
         messages: List of UITARS internal messages
-
+
     Returns:
         List of LiteLLM formatted messages
     """
     litellm_messages = []
     current_assistant_content = []
-
+
     for message in messages:
         if isinstance(message, dict):
             message_type = message.get("type")
-
+
             if message_type == "reasoning":
                 # Extract reasoning text from summary
                 summary = message.get("summary", [])
                 if summary and isinstance(summary, list):
                     for summary_item in summary:
-                        if
+                        if (
+                            isinstance(summary_item, dict)
+                            and summary_item.get("type") == "summary_text"
+                        ):
                             reasoning_text = summary_item.get("text", "")
                             if reasoning_text:
                                 current_assistant_content.append(f"Thought: {reasoning_text}")
-
+
             elif message_type == "computer_call":
                 # Convert computer action to UITARS action format
                 action = message.get("action", {})
                 action_type = action.get("type")
-
+
                 if action_type == "click":
                     x, y = action.get("x", 0), action.get("y", 0)
                     button = action.get("button", "left")
@@ -447,59 +482,65 @@ def convert_uitars_messages_to_litellm(messages: Messages) -> List[Dict[str, Any
                         action_text = f"Action: right_single(start_box='({x},{y})')"
                     else:
                         action_text = f"Action: click(start_box='({x},{y})')"
-
+
                 elif action_type == "double_click":
                     x, y = action.get("x", 0), action.get("y", 0)
                     action_text = f"Action: left_double(start_box='({x},{y})')"
-
+
                 elif action_type == "drag":
                     start_x, start_y = action.get("start_x", 0), action.get("start_y", 0)
                     end_x, end_y = action.get("end_x", 0), action.get("end_y", 0)
                     action_text = f"Action: drag(start_box='({start_x},{start_y})', end_box='({end_x},{end_y})')"
-
+
                 elif action_type == "key":
                     key = action.get("key", "")
                     action_text = f"Action: hotkey(key='{key}')"
-
+
                 elif action_type == "type":
                     text = action.get("text", "")
                     # Escape single quotes in the text
                     escaped_text = escape_single_quotes(text)
                     action_text = f"Action: type(content='{escaped_text}')"
-
+
                 elif action_type == "scroll":
                     x, y = action.get("x", 0), action.get("y", 0)
                     direction = action.get("direction", "down")
                     action_text = f"Action: scroll(start_box='({x},{y})', direction='{direction}')"
-
+
                 elif action_type == "wait":
                     action_text = "Action: wait()"
-
+
                 else:
                     # Fallback for unknown action types
                     action_text = f"Action: {action_type}({action})"
-
+
                 current_assistant_content.append(action_text)
-
+
                 # When we hit a computer_call_output, finalize the current assistant message
                 if current_assistant_content:
-                    litellm_messages.append(
-
-
-
+                    litellm_messages.append(
+                        {
+                            "role": "assistant",
+                            "content": [
+                                {"type": "text", "text": "\n".join(current_assistant_content)}
+                            ],
+                        }
+                    )
                     current_assistant_content = []
-
+
             elif message_type == "computer_call_output":
                 # Add screenshot from computer call output
                 output = message.get("output", {})
                 if isinstance(output, dict) and output.get("type") == "input_image":
                     image_url = output.get("image_url", "")
                     if image_url:
-                        litellm_messages.append(
-
-
-
-
+                        litellm_messages.append(
+                            {
+                                "role": "user",
+                                "content": [{"type": "image_url", "image_url": {"url": image_url}}],
+                            }
+                        )
+
             elif message.get("role") == "user":
                 # # Handle user messages
                 # content = message.get("content", "")
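Putting the branches above together, a hypothetical input/output pair for `convert_uitars_messages_to_litellm` (item shapes inferred from this diff, values invented):

    history = [
        {"type": "reasoning", "summary": [{"type": "summary_text", "text": "Open the browser"}]},
        {"type": "computer_call", "action": {"type": "click", "x": 960, "y": 270, "button": "left"}},
        {"type": "computer_call_output", "output": {"type": "input_image", "image_url": "data:image/png;base64,..."}},
    ]
    # Expected result: an assistant message whose text is
    # "Thought: Open the browser\nAction: click(start_box='(960,270)')",
    # followed by a user message carrying the screenshot image_url.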
@@ -514,24 +555,22 @@ def convert_uitars_messages_to_litellm(messages: Messages) -> List[Dict[str, Any
                 #     "content": content
                 # })
                 pass
-
+
     # Add any remaining assistant content
     if current_assistant_content:
-        litellm_messages.append({
-
-            "content": current_assistant_content
-        })
-
+        litellm_messages.append({"role": "assistant", "content": current_assistant_content})
+
     return litellm_messages

+
 @register_agent(models=r"(?i).*ui-?tars.*")
 class UITARSConfig:
     """
     UITARS agent configuration using liteLLM for ByteDance-Seed/UI-TARS-1.5-7B model.
-
+
     Supports UITARS vision-language models for computer control.
     """
-
+
     async def predict_step(
         self,
         messages: List[Dict[str, Any]],
@@ -545,11 +584,11 @@ class UITARSConfig:
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         """
         Predict the next step based on input messages.
-
+
         Args:
             messages: Input messages following Responses format
             model: Model name to use
@@ -562,22 +601,22 @@
             _on_usage: Callback for usage tracking
             _on_screenshot: Callback for screenshot events
             **kwargs: Additional arguments
-
+
         Returns:
             Dictionary with "output" (output items) and "usage" array
         """
         tools = tools or []
-
+
         # Create response items
         response_items = []
-
+
         # Find computer tool for screen dimensions
         computer_tool = None
         for tool_schema in tools:
             if tool_schema["type"] == "computer":
                 computer_tool = tool_schema["computer"]
                 break
-
+
         # Get screen dimensions
         screen_width, screen_height = 1024, 768
         if computer_tool:
@@ -585,20 +624,20 @@
                 screen_width, screen_height = await computer_tool.get_dimensions()
             except:
                 pass
-
+
         # Process messages to extract instruction and image
         instruction = ""
         image_data = None
-
+
         # Convert messages to list if string
         if isinstance(messages, str):
             messages = [{"role": "user", "content": messages}]
-
+
         # Extract instruction and latest screenshot
         for message in reversed(messages):
             if isinstance(message, dict):
                 content = message.get("content", "")
-
+
                 # Handle different content formats
                 if isinstance(content, str):
                     if not instruction and message.get("role") == "user":
@@ -614,46 +653,41 @@
                                 image_data = image_url.get("url", "")
                             else:
                                 image_data = image_url
-
+
                 # Also check for computer_call_output with screenshots
                 if message.get("type") == "computer_call_output" and not image_data:
                     output = message.get("output", {})
                     if isinstance(output, dict) and output.get("type") == "input_image":
                         image_data = output.get("image_url", "")
-
+
                 if instruction and image_data:
                     break
-
+
         if not instruction:
-            instruction =
-
+            instruction = (
+                "Help me complete this task by analyzing the screen and taking appropriate actions."
+            )
+
         # Create prompt
         user_prompt = UITARS_PROMPT_TEMPLATE.format(
-            instruction=instruction,
-            action_space=UITARS_ACTION_SPACE,
-            language="English"
+            instruction=instruction, action_space=UITARS_ACTION_SPACE, language="English"
         )
-
+
         # Convert conversation history to LiteLLM format
         history_messages = convert_uitars_messages_to_litellm(messages)
-
+
         # Prepare messages for liteLLM
-        litellm_messages = [
-            {
-                "role": "system",
-                "content": "You are a helpful assistant."
-            }
-        ]
+        litellm_messages = [{"role": "system", "content": "You are a helpful assistant."}]

         # Add current user instruction with screenshot
         current_user_message = {
-            "role": "user",
+            "role": "user",
             "content": [
                 {"type": "text", "text": user_prompt},
-            ]
+            ],
         }
         litellm_messages.append(current_user_message)
-
+
         # Process image for UITARS
         if not image_data:
             # Take screenshot if none found in messages
@@ -667,17 +701,22 @@
             raise ValueError("No screenshot found in messages and no computer_handler provided")
         processed_image, original_width, original_height = process_image_for_uitars(image_data)
         encoded_image = pil_to_base64(processed_image)
-
+
         # Add conversation history
         if history_messages:
             litellm_messages.extend(history_messages)
         else:
-            litellm_messages.append(
-
-
-
-
-
+            litellm_messages.append(
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/png;base64,{encoded_image}"},
+                        }
+                    ],
+                }
+            )

         # Prepare API call kwargs
         api_kwargs = {
@@ -687,146 +726,142 @@
             "temperature": kwargs.get("temperature", 0.0),
             "do_sample": kwargs.get("temperature", 0.0) > 0.0,
             "num_retries": max_retries,
-            **{k: v for k, v in kwargs.items() if k not in ["max_tokens", "temperature"]}
+            **{k: v for k, v in kwargs.items() if k not in ["max_tokens", "temperature"]},
         }
-
+
         # Call API start hook
         if _on_api_start:
             await _on_api_start(api_kwargs)
-
+
         # Call liteLLM with UITARS model
         response = await litellm.acompletion(**api_kwargs)
-
+
         # Call API end hook
         if _on_api_end:
             await _on_api_end(api_kwargs, response)
-
+
         # Extract response content
-        response_content = response.choices[0].message.content.strip()
-
+        response_content = response.choices[0].message.content.strip()  # type: ignore
+
         # Parse UITARS response
         parsed_responses = parse_uitars_response(response_content, original_width, original_height)
-
+
         # Convert to computer actions
-        computer_actions = convert_to_computer_actions(
-
+        computer_actions = convert_to_computer_actions(
+            parsed_responses, original_width, original_height
+        )
+
         # Add computer actions to response items
         thought = parsed_responses[0].get("thought", "")
         if thought:
             response_items.append(make_reasoning_item(thought))
         response_items.extend(computer_actions)
-
+
         # Extract usage information
         response_usage = {
-            **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(
+            **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(
+                response.usage
+            ).model_dump(),
             "response_cost": response._hidden_params.get("response_cost", 0.0),
         }
         if _on_usage:
             await _on_usage(response_usage)

         # Create agent response
-        agent_response = {
-
-            "usage": response_usage
-        }
-
+        agent_response = {"output": response_items, "usage": response_usage}
+
         return agent_response
-
+
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str
+        self, model: str, image_b64: str, instruction: str
     ) -> Optional[Tuple[int, int]]:
         """
         Predict click coordinates based on image and instruction.
-
+
         UITARS supports click prediction through its action parsing.
-
+
         Args:
             model: Model name to use
             image_b64: Base64 encoded image
             instruction: Instruction for where to click
-
+
         Returns:
             Tuple with (x, y) coordinates or None
         """
         try:
             # Create prompt using grounding template
-            user_prompt = GROUNDING_UITARS_PROMPT_TEMPLATE.format(
-
-            )
-
+            user_prompt = GROUNDING_UITARS_PROMPT_TEMPLATE.format(instruction=instruction)
+
             # Process image for UITARS
             processed_image, original_width, original_height = process_image_for_uitars(image_b64)
             encoded_image = pil_to_base64(processed_image)
-
+
             # Prepare messages for liteLLM
             litellm_messages = [
-                {
-                    "role": "system",
-                    "content": "You are a helpful assistant."
-                },
+                {"role": "system", "content": "You are a helpful assistant."},
                 {
                     "role": "user",
                     "content": [
                         {"type": "text", "text": user_prompt},
-                        {
-
-
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/png;base64,{encoded_image}"},
+                        },
+                    ],
+                },
             ]
-
+
             # Prepare API call kwargs
             api_kwargs = {
                 "model": model,
                 "messages": litellm_messages,
                 "max_tokens": 2056,
                 "temperature": 0.0,
-                "do_sample": False
+                "do_sample": False,
             }
-
+
             # Call liteLLM with UITARS model
             response = await litellm.acompletion(**api_kwargs)
-
+
             # Extract response content
-            response_content = response.choices[0].message.content.strip()
-
+            response_content = response.choices[0].message.content.strip()  # type: ignore
+
             print(response_content)

             # Parse the response to extract click coordinates
             # Look for click action with coordinates (with special tokens)
             click_pattern = r"click\(point='<\|box_start\|>\((\d+),(\d+)\)<\|box_end\|>'\)"
             match = re.search(click_pattern, response_content)
-
+
             # Fallback: Look for simpler format without special tokens
             if not match:
                 # Pattern for: click(start_box='(x,y)') or click(point='(x,y)')
                 fallback_pattern = r"click\((?:start_box|point)='\((\d+),(\d+)\)'\)"
                 match = re.search(fallback_pattern, response_content)
-
+
             if match:
                 x, y = int(match.group(1)), int(match.group(2))
                 # Scale coordinates back to original image dimensions
                 scale_x = original_width / processed_image.width
                 scale_y = original_height / processed_image.height
-
+
                 scaled_x = int(x * scale_x)
                 scaled_y = int(y * scale_y)
-
+
                 return (scaled_x, scaled_y)
-
+
             return None
-
+
         except Exception as e:
             # Log error and return None
             print(f"Error in predict_click: {e}")
             return None
-
+
     def get_capabilities(self) -> List[AgentCapability]:
         """
         Get list of capabilities supported by this agent config.
-
+
         Returns:
             List of capability strings
         """
-        return ["step", "click"]
+        return ["step", "click"]