cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of cua-agent has been flagged as potentially problematic.
- agent/__init__.py +4 -19
- agent/__main__.py +2 -1
- agent/adapters/__init__.py +6 -0
- agent/adapters/azure_ml_adapter.py +283 -0
- agent/adapters/cua_adapter.py +161 -0
- agent/adapters/huggingfacelocal_adapter.py +67 -125
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +370 -0
- agent/adapters/models/__init__.py +41 -0
- agent/adapters/models/generic.py +78 -0
- agent/adapters/models/internvl.py +290 -0
- agent/adapters/models/opencua.py +115 -0
- agent/adapters/models/qwen2_5_vl.py +78 -0
- agent/agent.py +431 -241
- agent/callbacks/__init__.py +10 -3
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +54 -98
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +140 -0
- agent/callbacks/otel.py +291 -0
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/prompt_instructions.py +47 -0
- agent/callbacks/telemetry.py +106 -69
- agent/callbacks/trajectory_saver.py +178 -70
- agent/cli.py +269 -119
- agent/computers/__init__.py +14 -9
- agent/computers/base.py +32 -19
- agent/computers/cua.py +52 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +359 -235
- agent/integrations/hud/__init__.py +164 -74
- agent/integrations/hud/agent.py +338 -342
- agent/integrations/hud/proxy.py +297 -0
- agent/loops/__init__.py +44 -14
- agent/loops/anthropic.py +590 -492
- agent/loops/base.py +19 -15
- agent/loops/composed_grounded.py +142 -144
- agent/loops/fara/__init__.py +8 -0
- agent/loops/fara/config.py +506 -0
- agent/loops/fara/helpers.py +357 -0
- agent/loops/fara/schema.py +143 -0
- agent/loops/gelato.py +183 -0
- agent/loops/gemini.py +935 -0
- agent/loops/generic_vlm.py +601 -0
- agent/loops/glm45v.py +140 -135
- agent/loops/gta1.py +48 -51
- agent/loops/holo.py +218 -0
- agent/loops/internvl.py +180 -0
- agent/loops/moondream3.py +493 -0
- agent/loops/omniparser.py +326 -226
- agent/loops/openai.py +63 -56
- agent/loops/opencua.py +134 -0
- agent/loops/uiins.py +175 -0
- agent/loops/uitars.py +262 -212
- agent/loops/uitars2.py +951 -0
- agent/playground/__init__.py +5 -0
- agent/playground/server.py +301 -0
- agent/proxy/examples.py +196 -0
- agent/proxy/handlers.py +255 -0
- agent/responses.py +486 -339
- agent/tools/__init__.py +24 -0
- agent/tools/base.py +253 -0
- agent/tools/browser_tool.py +423 -0
- agent/types.py +20 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +25 -22
- agent/ui/gradio/ui_components.py +314 -167
- cua_agent-0.7.16.dist-info/METADATA +85 -0
- cua_agent-0.7.16.dist-info/RECORD +79 -0
- {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
- agent/integrations/hud/adapter.py +0 -121
- agent/integrations/hud/computer_handler.py +0 -187
- agent/telemetry.py +0 -142
- cua_agent-0.4.14.dist-info/METADATA +0 -436
- cua_agent-0.4.14.dist-info/RECORD +0 -50
- {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/loops/uitars.py
CHANGED
@@ -4,39 +4,50 @@ Paper: https://arxiv.org/abs/2501.12326
 Code: https://github.com/bytedance/UI-TARS
 """
 
+import ast
 import asyncio
-from ctypes import cast
-import json
 import base64
+import json
 import math
 import re
-import ast
-from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
+from ctypes import cast
 from io import BytesIO
-from …
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+
 import litellm
-from litellm.…
-
+from litellm.responses.litellm_completion_transformation.transformation import (
+    LiteLLMCompletionResponsesConfig,
+)
 from litellm.responses.utils import Usage
-from …
+from litellm.types.utils import ModelResponse
+from openai.types.responses.response_computer_tool_call_param import (
+    ActionType,
+    ResponseComputerToolCallParam,
+)
 from openai.types.responses.response_input_param import ComputerCallOutput
-from openai.types.responses.response_output_message_param import ResponseOutputMessageParam
-…
+from openai.types.responses.response_output_message_param import (
+    ResponseOutputMessageParam,
+)
+from openai.types.responses.response_reasoning_item_param import (
+    ResponseReasoningItemParam,
+    Summary,
+)
+from PIL import Image
 
 from ..decorators import register_agent
-from ..types import Messages, AgentResponse, Tools, AgentCapability
 from ..responses import (
-    make_reasoning_item,
-    make_output_text_item,
     make_click_item,
     make_double_click_item,
     make_drag_item,
+    make_input_image_item,
     make_keypress_item,
+    make_output_text_item,
+    make_reasoning_item,
     make_scroll_item,
     make_type_item,
     make_wait_item,
-    make_input_image_item
 )
+from ..types import AgentCapability, AgentResponse, Messages, Tools
 
 # Constants from reference code
 IMAGE_FACTOR = 28
@@ -94,6 +105,7 @@ click(point='<|box_start|>(x1,y1)<|box_end|>')
 ## User Instruction
 {instruction}"""
 
+
 def round_by_factor(number: float, factor: int) -> int:
     """Returns the closest integer to 'number' that is divisible by 'factor'."""
     return round(number / factor) * factor
@@ -110,7 +122,11 @@ def floor_by_factor(number: float, factor: int) -> int:
 
 
 def smart_resize(
-    height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
+    height: int,
+    width: int,
+    factor: int = IMAGE_FACTOR,
+    min_pixels: int = MIN_PIXELS,
+    max_pixels: int = MAX_PIXELS,
 ) -> tuple[int, int]:
     """
     Rescales the image so that the following conditions are met:
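This hunk only reflows the `smart_resize` signature; the behavior is unchanged. For context, here is a minimal runnable sketch of what a resize of this shape computes, built from the `round_by_factor`/`floor_by_factor`/`ceil_by_factor` helpers defined in this file; the `min_pixels`/`max_pixels` defaults below are placeholders, not the constants actually defined in uitars.py:

```python
import math

IMAGE_FACTOR = 28  # from this file


def round_by_factor(number: float, factor: int) -> int:
    return round(number / factor) * factor


def floor_by_factor(number: float, factor: int) -> int:
    return math.floor(number / factor) * factor


def ceil_by_factor(number: float, factor: int) -> int:
    return math.ceil(number / factor) * factor


def smart_resize_sketch(height, width, factor=IMAGE_FACTOR,
                        min_pixels=100 * 28 * 28, max_pixels=16384 * 28 * 28):
    # Snap both sides to multiples of `factor`, then rescale so the total
    # pixel count stays inside [min_pixels, max_pixels].
    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))
    if h_bar * w_bar > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = floor_by_factor(height / beta, factor)
        w_bar = floor_by_factor(width / beta, factor)
    elif h_bar * w_bar < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * beta, factor)
        w_bar = ceil_by_factor(width * beta, factor)
    return h_bar, w_bar


print(smart_resize_sketch(1080, 1920))  # -> (1092, 1932)
```

Snapping to multiples of 28 is consistent with a 28-pixel vision-encoder patch grid (hence IMAGE_FACTOR = 28), so resized screenshots always tile cleanly into patches.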
@@ -144,14 +160,14 @@ def escape_single_quotes(text):
 def parse_action(action_str):
     """Parse action string into structured format."""
     try:
-        node = ast.parse(action_str, mode='eval')
+        node = ast.parse(action_str, mode="eval")
         if not isinstance(node, ast.Expression):
             raise ValueError("Not an expression")
-
+
         call = node.body
         if not isinstance(call, ast.Call):
             raise ValueError("Not a function call")
-
+
         # Get function name
         if isinstance(call.func, ast.Name):
             func_name = call.func.id
@@ -159,7 +175,7 @@ def parse_action(action_str):
             func_name = call.func.attr
         else:
             func_name = None
-
+
         # Get keyword arguments
         kwargs = {}
         for kw in call.keywords:
@@ -171,12 +187,9 @@ def parse_action(action_str):
             else:
                 value = None
             kwargs[key] = value
-
-        return {
-            'function': func_name,
-            'args': kwargs
-        }
-
+
+        return {"function": func_name, "args": kwargs}
+
     except Exception as e:
         print(f"Failed to parse action '{action_str}': {e}")
         return None
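The reformatted `parse_action` treats a model action such as `click(start_box='(100,200)')` as a Python call expression. A self-contained illustration of the same `ast` technique; this simplified variant uses `ast.literal_eval` for the keyword values, where the real function walks the nodes itself:

```python
import ast


def parse_action_demo(action_str: str):
    # Parse the action as a Python expression and pull out the function
    # name plus its keyword arguments.
    node = ast.parse(action_str, mode="eval")
    call = node.body
    assert isinstance(call, ast.Call)
    func_name = call.func.id if isinstance(call.func, ast.Name) else None
    kwargs = {kw.arg: ast.literal_eval(kw.value) for kw in call.keywords}
    return {"function": func_name, "args": kwargs}


print(parse_action_demo("click(start_box='(100,200)')"))
# -> {'function': 'click', 'args': {'start_box': '(100,200)'}}
```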
@@ -185,39 +198,39 @@ def parse_action(action_str):
 def parse_uitars_response(text: str, image_width: int, image_height: int) -> List[Dict[str, Any]]:
     """Parse UITARS model response into structured actions."""
     text = text.strip()
-
+
     # Extract thought
     thought = None
     if text.startswith("Thought:"):
         thought_match = re.search(r"Thought: (.+?)(?=\s*Action:|$)", text, re.DOTALL)
         if thought_match:
             thought = thought_match.group(1).strip()
-
+
     # Extract action
     if "Action:" not in text:
         raise ValueError("No Action found in response")
-
+
     action_str = text.split("Action:")[-1].strip()
 
     # Handle special case for type actions
     if "type(content" in action_str:
+
         def escape_quotes(match):
             return match.group(1)
-
+
         pattern = r"type\(content='(.*?)'\)"
         content = re.sub(pattern, escape_quotes, action_str)
         action_str = escape_single_quotes(content)
         action_str = "type(content='" + action_str + "')"
-
-
+
     # Parse the action
     parsed_action = parse_action(action_str.replace("\n", "\\n").lstrip())
     if parsed_action is None:
         raise ValueError(f"Action can't parse: {action_str}")
-
+
     action_type = parsed_action["function"]
     params = parsed_action["args"]
-
+
     # Process parameters
     action_inputs = {}
     for param_name, param in params.items():
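A concrete example of the response format this parser expects; the sample text is made up, but the regex and the `split("Action:")` step are exactly the ones above:

```python
import re

raw = (
    "Thought: The search field is at the top, so I should click it first.\n"
    "Action: click(start_box='<|box_start|>(492,86)<|box_end|>')"
)

thought_match = re.search(r"Thought: (.+?)(?=\s*Action:|$)", raw, re.DOTALL)
thought = thought_match.group(1).strip() if thought_match else None
action_str = raw.split("Action:")[-1].strip()

print(thought)     # The search field is at the top, so I should click it first.
print(action_str)  # click(start_box='<|box_start|>(492,86)<|box_end|>')
```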
@@ -225,116 +238,138 @@ def parse_uitars_response(text: str, image_width: int, image_height: int) -> List[Dict[str, Any]]:
             continue
         param = str(param).lstrip()
         action_inputs[param_name.strip()] = param
-
+
         # Handle coordinate parameters
         if "start_box" in param_name or "end_box" in param_name:
-            # Parse coordinates like '(x,y)' or '(…
-            …
+            # Parse coordinates like '<|box_start|>(x,y)<|box_end|>' or '(x,y)'
+            # First, remove special tokens
+            clean_param = param.replace("<|box_start|>", "").replace("<|box_end|>", "")
+            # Then remove parentheses and split
+            numbers = clean_param.replace("(", "").replace(")", "").split(",")
+
+            try:
+                float_numbers = [
+                    float(num.strip()) / 1000 for num in numbers
+                ]  # Normalize to 0-1 range
+
+                if len(float_numbers) == 2:
+                    # Single point, duplicate for box format
+                    float_numbers = [
+                        float_numbers[0],
+                        float_numbers[1],
+                        float_numbers[0],
+                        float_numbers[1],
+                    ]
+
+                action_inputs[param_name.strip()] = str(float_numbers)
+            except ValueError as e:
+                # If parsing fails, keep the original parameter value
+                print(f"Warning: Could not parse coordinates '{param}': {e}")
+                action_inputs[param_name.strip()] = param
+
+    return [
+        {
+            "thought": thought,
+            "action_type": action_type,
+            "action_inputs": action_inputs,
+            "text": text,
+        }
+    ]
+
+
+def convert_to_computer_actions(
+    parsed_responses: List[Dict[str, Any]], image_width: int, image_height: int
+) -> List[ResponseComputerToolCallParam | ResponseOutputMessageParam]:
     """Convert parsed UITARS responses to computer actions."""
     computer_actions = []
-
+
     for response in parsed_responses:
         action_type = response.get("action_type")
         action_inputs = response.get("action_inputs", {})
-
+
         if action_type == "finished":
             finished_text = action_inputs.get("content", "Task completed successfully.")
             computer_actions.append(make_output_text_item(finished_text))
             break
-
+
         elif action_type == "wait":
             computer_actions.append(make_wait_item())
-
+
         elif action_type == "call_user":
-            computer_actions.append(
-                …
+            computer_actions.append(
+                make_output_text_item("I need assistance from the user to proceed with this task.")
+            )
+
         elif action_type in ["click", "left_single"]:
             start_box = action_inputs.get("start_box")
             if start_box:
                 coords = eval(start_box)
                 x = int((coords[0] + coords[2]) / 2 * image_width)
                 y = int((coords[1] + coords[3]) / 2 * image_height)
-
+
                 computer_actions.append(make_click_item(x, y, "left"))
-
-        elif action_type …
+
+        elif action_type in ["double_click", "left_double"]:
             start_box = action_inputs.get("start_box")
             if start_box:
                 coords = eval(start_box)
                 x = int((coords[0] + coords[2]) / 2 * image_width)
                 y = int((coords[1] + coords[3]) / 2 * image_height)
-
+
                 computer_actions.append(make_double_click_item(x, y))
-
-        elif action_type …
+
+        elif action_type in ["right_click", "right_single"]:
             start_box = action_inputs.get("start_box")
             if start_box:
                 coords = eval(start_box)
                 x = int((coords[0] + coords[2]) / 2 * image_width)
                 y = int((coords[1] + coords[3]) / 2 * image_height)
-
+
                 computer_actions.append(make_click_item(x, y, "right"))
-
+
         elif action_type == "type":
             content = action_inputs.get("content", "")
             computer_actions.append(make_type_item(content))
-
+
         elif action_type == "hotkey":
             key = action_inputs.get("key", "")
             keys = key.split()
             computer_actions.append(make_keypress_item(keys))
-
+
         elif action_type == "press":
             key = action_inputs.get("key", "")
             computer_actions.append(make_keypress_item([key]))
-
+
         elif action_type == "scroll":
             start_box = action_inputs.get("start_box")
             direction = action_inputs.get("direction", "down")
-
+
             if start_box:
                 coords = eval(start_box)
                 x = int((coords[0] + coords[2]) / 2 * image_width)
                 y = int((coords[1] + coords[3]) / 2 * image_height)
             else:
                 x, y = image_width // 2, image_height // 2
-
+
             scroll_y = 5 if "up" in direction.lower() else -5
             computer_actions.append(make_scroll_item(x, y, 0, scroll_y))
-
+
         elif action_type == "drag":
             start_box = action_inputs.get("start_box")
             end_box = action_inputs.get("end_box")
-
+
             if start_box and end_box:
                 start_coords = eval(start_box)
                 end_coords = eval(end_box)
-
+
                 start_x = int((start_coords[0] + start_coords[2]) / 2 * image_width)
                 start_y = int((start_coords[1] + start_coords[3]) / 2 * image_height)
                 end_x = int((end_coords[0] + end_coords[2]) / 2 * image_width)
                 end_y = int((end_coords[1] + end_coords[3]) / 2 * image_height)
-
+
                 path = [{"x": start_x, "y": start_y}, {"x": end_x, "y": end_y}]
                 computer_actions.append(make_drag_item(path))
-
+
     return computer_actions
 
 
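The new coordinate handling assumes UI-TARS emits points on a 0-1000 grid: dividing by 1000 normalizes to 0-1, a bare point is duplicated into an (x1, y1, x2, y2) box, and `convert_to_computer_actions` maps the box center back to screen pixels. Worked through by hand with made-up numbers:

```python
# A point parameter straight from the model, on its 0-1000 grid.
param = "<|box_start|>(492,86)<|box_end|>"
clean = param.replace("<|box_start|>", "").replace("<|box_end|>", "")
nums = [float(n.strip()) / 1000 for n in clean.replace("(", "").replace(")", "").split(",")]
box = [nums[0], nums[1], nums[0], nums[1]]  # point -> degenerate box

image_width, image_height = 1920, 1080  # example screenshot size
x = int((box[0] + box[2]) / 2 * image_width)
y = int((box[1] + box[3]) / 2 * image_height)
print(x, y)  # 944 92
```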
@@ -345,33 +380,35 @@ def pil_to_base64(image: Image.Image) -> str:
     return base64.b64encode(buffer.getvalue()).decode("utf-8")
 
 
-def process_image_for_uitars(image_data: str, max_pixels: int = MAX_PIXELS, min_pixels: int = MIN_PIXELS) -> tuple[Image.Image, int, int]:
+def process_image_for_uitars(
+    image_data: str, max_pixels: int = MAX_PIXELS, min_pixels: int = MIN_PIXELS
+) -> tuple[Image.Image, int, int]:
     """Process image for UITARS model input."""
     # Decode base64 image
-    if image_data.startswith('data:image'):
-        image_data = image_data.split(',')[1]
-
+    if image_data.startswith("data:image"):
+        image_data = image_data.split(",")[1]
+
     image_bytes = base64.b64decode(image_data)
     image = Image.open(BytesIO(image_bytes))
-
+
     original_width, original_height = image.size
-
+
     # Resize image according to UITARS requirements
     if image.width * image.height > max_pixels:
         resize_factor = math.sqrt(max_pixels / (image.width * image.height))
         width = int(image.width * resize_factor)
         height = int(image.height * resize_factor)
         image = image.resize((width, height))
-
+
     if image.width * image.height < min_pixels:
         resize_factor = math.sqrt(min_pixels / (image.width * image.height))
         width = math.ceil(image.width * resize_factor)
         height = math.ceil(image.height * resize_factor)
         image = image.resize((width, height))
-
+
     if image.mode != "RGB":
         image = image.convert("RGB")
-
+
     return image, original_width, original_height
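`process_image_for_uitars` shrinks by `sqrt(max_pixels / area)`, which scales the area to exactly the budget while keeping the aspect ratio. A tiny standalone check; the pixel budget here is arbitrary, not the module's MAX_PIXELS:

```python
import math


def shrink_to_budget(width: int, height: int, max_pixels: int):
    # Scaling both sides by sqrt(budget / area) makes the new area equal
    # the budget while preserving aspect ratio, as the code above does.
    if width * height > max_pixels:
        f = math.sqrt(max_pixels / (width * height))
        return int(width * f), int(height * f)
    return width, height


print(shrink_to_budget(3840, 2160, 1920 * 1080))  # -> (1920, 1080)
```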
@@ -382,7 +419,11 @@ def sanitize_message(msg: Any) -> Any:
     for key, value in msg.items():
         if key == "content" and isinstance(value, list):
             result[key] = [
-                {k: v for k, v in item.items() if k != "image_url"} if isinstance(item, dict) else item
+                (
+                    {k: v for k, v in item.items() if k != "image_url"}
+                    if isinstance(item, dict)
+                    else item
+                )
                 for item in value
             ]
         else:
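This comprehension exists so logs do not carry base64 screenshot payloads. A minimal sketch of the same filtering idea, outside the full `sanitize_message` function:

```python
def sanitize_message_demo(msg: dict) -> dict:
    # Drop bulky image_url fields from content items before logging.
    if isinstance(msg.get("content"), list):
        msg = {
            **msg,
            "content": [
                {k: v for k, v in item.items() if k != "image_url"}
                if isinstance(item, dict)
                else item
                for item in msg["content"]
            ],
        }
    return msg


m = {"role": "user", "content": [{"type": "image_url", "image_url": {"url": "data:image/png;base64,AAAA"}}]}
print(sanitize_message_demo(m))
# -> {'role': 'user', 'content': [{'type': 'image_url'}]}
```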
@@ -397,38 +438,41 @@ def sanitize_message(msg: Any) -> Any:
 def convert_uitars_messages_to_litellm(messages: Messages) -> List[Dict[str, Any]]:
     """
     Convert UITARS internal message format back to LiteLLM format.
-
+
     This function processes reasoning, computer_call, and computer_call_output messages
     and converts them to the appropriate LiteLLM assistant message format.
-
+
     Args:
         messages: List of UITARS internal messages
-
+
     Returns:
         List of LiteLLM formatted messages
     """
     litellm_messages = []
     current_assistant_content = []
-
+
     for message in messages:
         if isinstance(message, dict):
             message_type = message.get("type")
-
+
             if message_type == "reasoning":
                 # Extract reasoning text from summary
                 summary = message.get("summary", [])
                 if summary and isinstance(summary, list):
                     for summary_item in summary:
-                        if isinstance(summary_item, dict) and summary_item.get("type") == "summary_text":
+                        if (
+                            isinstance(summary_item, dict)
+                            and summary_item.get("type") == "summary_text"
+                        ):
                             reasoning_text = summary_item.get("text", "")
                             if reasoning_text:
                                 current_assistant_content.append(f"Thought: {reasoning_text}")
-
+
             elif message_type == "computer_call":
                 # Convert computer action to UITARS action format
                 action = message.get("action", {})
                 action_type = action.get("type")
-
+
                 if action_type == "click":
                     x, y = action.get("x", 0), action.get("y", 0)
                     button = action.get("button", "left")
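The reasoning branch folds `summary_text` entries into the next assistant turn as `Thought: ...` lines. Replaying just that branch on a made-up message:

```python
message = {
    "type": "reasoning",
    "summary": [{"type": "summary_text", "text": "The dialog is blocking the button."}],
}

current_assistant_content = []
for summary_item in message.get("summary", []):
    if isinstance(summary_item, dict) and summary_item.get("type") == "summary_text":
        text = summary_item.get("text", "")
        if text:
            current_assistant_content.append(f"Thought: {text}")

print(current_assistant_content)
# -> ['Thought: The dialog is blocking the button.']
```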
@@ -438,59 +482,65 @@ def convert_uitars_messages_to_litellm(messages: Messages) -> List[Dict[str, Any]]:
                         action_text = f"Action: right_single(start_box='({x},{y})')"
                     else:
                         action_text = f"Action: click(start_box='({x},{y})')"
-
+
                 elif action_type == "double_click":
                     x, y = action.get("x", 0), action.get("y", 0)
                     action_text = f"Action: left_double(start_box='({x},{y})')"
-
+
                 elif action_type == "drag":
                     start_x, start_y = action.get("start_x", 0), action.get("start_y", 0)
                     end_x, end_y = action.get("end_x", 0), action.get("end_y", 0)
                     action_text = f"Action: drag(start_box='({start_x},{start_y})', end_box='({end_x},{end_y})')"
-
+
                 elif action_type == "key":
                     key = action.get("key", "")
                     action_text = f"Action: hotkey(key='{key}')"
-
+
                 elif action_type == "type":
                     text = action.get("text", "")
                     # Escape single quotes in the text
                     escaped_text = escape_single_quotes(text)
                     action_text = f"Action: type(content='{escaped_text}')"
-
+
                 elif action_type == "scroll":
                     x, y = action.get("x", 0), action.get("y", 0)
                     direction = action.get("direction", "down")
                     action_text = f"Action: scroll(start_box='({x},{y})', direction='{direction}')"
-
+
                 elif action_type == "wait":
                     action_text = "Action: wait()"
-
+
                 else:
                     # Fallback for unknown action types
                     action_text = f"Action: {action_type}({action})"
-
+
                 current_assistant_content.append(action_text)
-
+
                 # When we hit a computer_call_output, finalize the current assistant message
                 if current_assistant_content:
-                    litellm_messages.append(
-                        …
+                    litellm_messages.append(
+                        {
+                            "role": "assistant",
+                            "content": [
+                                {"type": "text", "text": "\n".join(current_assistant_content)}
+                            ],
+                        }
+                    )
                     current_assistant_content = []
-
+
             elif message_type == "computer_call_output":
                 # Add screenshot from computer call output
                 output = message.get("output", {})
                 if isinstance(output, dict) and output.get("type") == "input_image":
                     image_url = output.get("image_url", "")
                     if image_url:
-                        litellm_messages.append(
-                            …
+                        litellm_messages.append(
+                            {
+                                "role": "user",
+                                "content": [{"type": "image_url", "image_url": {"url": image_url}}],
+                            }
+                        )
+
             elif message.get("role") == "user":
                 # # Handle user messages
                 # content = message.get("content", "")
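The `computer_call` branch is the inverse mapping: structured actions are serialized back into UI-TARS action syntax so the model sees its own history in its native format. For example, the click case:

```python
# Round-trip a computer_call back into UI-TARS action syntax,
# mirroring the elif chain above.
action = {"type": "click", "x": 512, "y": 384, "button": "left"}

x, y = action.get("x", 0), action.get("y", 0)
button = action.get("button", "left")
if button == "right":
    action_text = f"Action: right_single(start_box='({x},{y})')"
else:
    action_text = f"Action: click(start_box='({x},{y})')"

print(action_text)  # Action: click(start_box='(512,384)')
```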
@@ -505,24 +555,27 @@ def convert_uitars_messages_to_litellm(messages: Messages) -> List[Dict[str, Any]]:
             #     "content": content
             # })
             pass
-
+
     # Add any remaining assistant content
     if current_assistant_content:
-        litellm_messages.append(
-            …
+        litellm_messages.append(
+            {
+                "role": "assistant",
+                "content": [{"type": "text", "text": "\n".join(current_assistant_content)}],
+            }
+        )
+
     return litellm_messages
 
-
+
+@register_agent(models=r"(?i).*ui-?tars.*", priority=-1)
 class UITARSConfig:
     """
     UITARS agent configuration using liteLLM for ByteDance-Seed/UI-TARS-1.5-7B model.
-
+
     Supports UITARS vision-language models for computer control.
     """
-
+
     async def predict_step(
         self,
         messages: List[Dict[str, Any]],
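The newly added `@register_agent` decorator routes models by name: any model string containing `ui-tars` or `uitars` (case-insensitive) selects this config. The `priority=-1` argument presumably lets more specific configs win; its exact semantics live in `agent/decorators.py`, which is not shown here. A quick check of the regex itself:

```python
import re

pattern = r"(?i).*ui-?tars.*"
for name in ["ByteDance-Seed/UI-TARS-1.5-7B", "huggingface/uitars-2b", "gpt-4o"]:
    print(name, bool(re.match(pattern, name)))
# -> True, True, False
```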
@@ -536,11 +589,11 @@ class UITARSConfig:
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         """
         Predict the next step based on input messages.
-
+
         Args:
             messages: Input messages following Responses format
             model: Model name to use
@@ -553,22 +606,22 @@ class UITARSConfig:
             _on_usage: Callback for usage tracking
             _on_screenshot: Callback for screenshot events
             **kwargs: Additional arguments
-
+
         Returns:
             Dictionary with "output" (output items) and "usage" array
         """
         tools = tools or []
-
+
         # Create response items
         response_items = []
-
+
         # Find computer tool for screen dimensions
         computer_tool = None
         for tool_schema in tools:
             if tool_schema["type"] == "computer":
                 computer_tool = tool_schema["computer"]
                 break
-
+
         # Get screen dimensions
         screen_width, screen_height = 1024, 768
         if computer_tool:
@@ -576,20 +629,20 @@ class UITARSConfig:
                 screen_width, screen_height = await computer_tool.get_dimensions()
             except:
                 pass
-
+
         # Process messages to extract instruction and image
         instruction = ""
         image_data = None
-
+
         # Convert messages to list if string
         if isinstance(messages, str):
             messages = [{"role": "user", "content": messages}]
-
+
         # Extract instruction and latest screenshot
         for message in reversed(messages):
             if isinstance(message, dict):
                 content = message.get("content", "")
-
+
                 # Handle different content formats
                 if isinstance(content, str):
                     if not instruction and message.get("role") == "user":
@@ -605,46 +658,41 @@ class UITARSConfig:
                             image_data = image_url.get("url", "")
                         else:
                             image_data = image_url
-
+
                 # Also check for computer_call_output with screenshots
                 if message.get("type") == "computer_call_output" and not image_data:
                     output = message.get("output", {})
                     if isinstance(output, dict) and output.get("type") == "input_image":
                         image_data = output.get("image_url", "")
-
+
                 if instruction and image_data:
                     break
-
+
         if not instruction:
-            instruction = "Help me complete this task by analyzing the screen and taking appropriate actions."
-
+            instruction = (
+                "Help me complete this task by analyzing the screen and taking appropriate actions."
+            )
+
         # Create prompt
         user_prompt = UITARS_PROMPT_TEMPLATE.format(
-            instruction=instruction,
-            action_space=UITARS_ACTION_SPACE,
-            language="English"
+            instruction=instruction, action_space=UITARS_ACTION_SPACE, language="English"
         )
-
+
         # Convert conversation history to LiteLLM format
        history_messages = convert_uitars_messages_to_litellm(messages)
-
+
        # Prepare messages for liteLLM
-        litellm_messages = [
-            {
-                "role": "system",
-                "content": "You are a helpful assistant."
-            }
-        ]
+        litellm_messages = [{"role": "system", "content": "You are a helpful assistant."}]
 
        # Add current user instruction with screenshot
        current_user_message = {
-            "role": "user",
+            "role": "user",
            "content": [
                {"type": "text", "text": user_prompt},
-            ]
+            ],
        }
        litellm_messages.append(current_user_message)
-
+
        # Process image for UITARS
        if not image_data:
            # Take screenshot if none found in messages
@@ -658,17 +706,22 @@ class UITARSConfig:
                 raise ValueError("No screenshot found in messages and no computer_handler provided")
         processed_image, original_width, original_height = process_image_for_uitars(image_data)
         encoded_image = pil_to_base64(processed_image)
-
+
         # Add conversation history
         if history_messages:
             litellm_messages.extend(history_messages)
         else:
-            litellm_messages.append(
-                …
+            litellm_messages.append(
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/png;base64,{encoded_image}"},
+                        }
+                    ],
+                }
+            )
 
         # Prepare API call kwargs
         api_kwargs = {
@@ -678,146 +731,143 @@ class UITARSConfig:
             "temperature": kwargs.get("temperature", 0.0),
             "do_sample": kwargs.get("temperature", 0.0) > 0.0,
             "num_retries": max_retries,
-            **{k: v for k, v in kwargs.items() if k not in ["max_tokens", "temperature"]}
+            **{k: v for k, v in kwargs.items() if k not in ["max_tokens", "temperature"]},
         }
-
+
         # Call API start hook
         if _on_api_start:
             await _on_api_start(api_kwargs)
-
+
         # Call liteLLM with UITARS model
         response = await litellm.acompletion(**api_kwargs)
-
+
         # Call API end hook
         if _on_api_end:
             await _on_api_end(api_kwargs, response)
-
+
         # Extract response content
-        response_content = response.choices[0].message.content.strip()
-
+        response_content = response.choices[0].message.content.strip()  # type: ignore
+
         # Parse UITARS response
         parsed_responses = parse_uitars_response(response_content, original_width, original_height)
-
+
         # Convert to computer actions
-        computer_actions = convert_to_computer_actions(parsed_responses, original_width, original_height)
-
+        computer_actions = convert_to_computer_actions(
+            parsed_responses, original_width, original_height
+        )
+
         # Add computer actions to response items
         thought = parsed_responses[0].get("thought", "")
         if thought:
             response_items.append(make_reasoning_item(thought))
         response_items.extend(computer_actions)
-
+
         # Extract usage information
         response_usage = {
-            **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
+            **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(
+                response.usage
+            ).model_dump(),
             "response_cost": response._hidden_params.get("response_cost", 0.0),
         }
         if _on_usage:
             await _on_usage(response_usage)
 
         # Create agent response
-        agent_response = {
-            "output": response_items,
-            "usage": response_usage
-        }
-
+        agent_response = {"output": response_items, "usage": response_usage}
+
         return agent_response
-
+
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str
+        self, model: str, image_b64: str, instruction: str, **kwargs
     ) -> Optional[Tuple[int, int]]:
         """
         Predict click coordinates based on image and instruction.
-
+
         UITARS supports click prediction through its action parsing.
-
+
         Args:
             model: Model name to use
             image_b64: Base64 encoded image
             instruction: Instruction for where to click
-
+
         Returns:
             Tuple with (x, y) coordinates or None
         """
         try:
             # Create prompt using grounding template
-            user_prompt = GROUNDING_UITARS_PROMPT_TEMPLATE.format(
-                instruction=instruction
-            )
-
+            user_prompt = GROUNDING_UITARS_PROMPT_TEMPLATE.format(instruction=instruction)
+
             # Process image for UITARS
             processed_image, original_width, original_height = process_image_for_uitars(image_b64)
             encoded_image = pil_to_base64(processed_image)
-
+
             # Prepare messages for liteLLM
             litellm_messages = [
-                {
-                    "role": "system",
-                    "content": "You are a helpful assistant."
-                },
+                {"role": "system", "content": "You are a helpful assistant."},
                 {
                     "role": "user",
                     "content": [
                         {"type": "text", "text": user_prompt},
-                        {
-                            …
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/png;base64,{encoded_image}"},
+                        },
+                    ],
+                },
             ]
-
+
             # Prepare API call kwargs
             api_kwargs = {
                 "model": model,
                 "messages": litellm_messages,
-                "max_tokens": …
+                "max_tokens": 2056,
                 "temperature": 0.0,
-                "do_sample": False
+                "do_sample": False,
             }
-
+            api_kwargs.update({k: v for k, v in (kwargs or {}).items()})
+
             # Call liteLLM with UITARS model
             response = await litellm.acompletion(**api_kwargs)
-
+
             # Extract response content
-            response_content = response.choices[0].message.content.strip()
-
+            response_content = response.choices[0].message.content.strip()  # type: ignore
+
             print(response_content)
 
             # Parse the response to extract click coordinates
             # Look for click action with coordinates (with special tokens)
             click_pattern = r"click\(point='<\|box_start\|>\((\d+),(\d+)\)<\|box_end\|>'\)"
             match = re.search(click_pattern, response_content)
-
+
             # Fallback: Look for simpler format without special tokens
             if not match:
                 # Pattern for: click(start_box='(x,y)') or click(point='(x,y)')
                 fallback_pattern = r"click\((?:start_box|point)='\((\d+),(\d+)\)'\)"
                 match = re.search(fallback_pattern, response_content)
-
+
             if match:
                 x, y = int(match.group(1)), int(match.group(2))
                 # Scale coordinates back to original image dimensions
                 scale_x = original_width / processed_image.width
                 scale_y = original_height / processed_image.height
-
+
                 scaled_x = int(x * scale_x)
                 scaled_y = int(y * scale_y)
-
+
                 return (scaled_x, scaled_y)
-
+
             return None
-
+
         except Exception as e:
             # Log error and return None
             print(f"Error in predict_click: {e}")
             return None
-
+
     def get_capabilities(self) -> List[AgentCapability]:
         """
         Get list of capabilities supported by this agent config.
-
+
         Returns:
             List of capability strings
         """
-        return ["step", "click"]
+        return ["step", "click"]
|