cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of cua-agent might be problematic.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/__init__.py +4 -0
- agent/adapters/azure_ml_adapter.py +283 -0
- agent/adapters/cua_adapter.py +161 -0
- agent/adapters/huggingfacelocal_adapter.py +67 -125
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +41 -0
- agent/adapters/models/generic.py +78 -0
- agent/adapters/models/internvl.py +290 -0
- agent/adapters/models/opencua.py +115 -0
- agent/adapters/models/qwen2_5_vl.py +78 -0
- agent/agent.py +337 -185
- agent/callbacks/__init__.py +9 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +54 -98
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +35 -33
- agent/callbacks/otel.py +291 -0
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/prompt_instructions.py +47 -0
- agent/callbacks/telemetry.py +99 -61
- agent/callbacks/trajectory_saver.py +95 -69
- agent/cli.py +269 -119
- agent/computers/__init__.py +14 -9
- agent/computers/base.py +32 -19
- agent/computers/cua.py +52 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +359 -235
- agent/integrations/hud/__init__.py +38 -99
- agent/integrations/hud/agent.py +369 -0
- agent/integrations/hud/proxy.py +166 -52
- agent/loops/__init__.py +44 -14
- agent/loops/anthropic.py +579 -492
- agent/loops/base.py +19 -15
- agent/loops/composed_grounded.py +136 -150
- agent/loops/fara/__init__.py +8 -0
- agent/loops/fara/config.py +506 -0
- agent/loops/fara/helpers.py +357 -0
- agent/loops/fara/schema.py +143 -0
- agent/loops/gelato.py +183 -0
- agent/loops/gemini.py +935 -0
- agent/loops/generic_vlm.py +601 -0
- agent/loops/glm45v.py +140 -135
- agent/loops/gta1.py +48 -51
- agent/loops/holo.py +218 -0
- agent/loops/internvl.py +180 -0
- agent/loops/moondream3.py +493 -0
- agent/loops/omniparser.py +326 -226
- agent/loops/openai.py +50 -51
- agent/loops/opencua.py +134 -0
- agent/loops/uiins.py +175 -0
- agent/loops/uitars.py +247 -206
- agent/loops/uitars2.py +951 -0
- agent/playground/__init__.py +5 -0
- agent/playground/server.py +301 -0
- agent/proxy/examples.py +61 -57
- agent/proxy/handlers.py +46 -39
- agent/responses.py +447 -347
- agent/tools/__init__.py +24 -0
- agent/tools/base.py +253 -0
- agent/tools/browser_tool.py +423 -0
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +25 -22
- agent/ui/gradio/ui_components.py +314 -167
- cua_agent-0.7.16.dist-info/METADATA +85 -0
- cua_agent-0.7.16.dist-info/RECORD +79 -0
- {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
- cua_agent-0.4.22.dist-info/METADATA +0 -436
- cua_agent-0.4.22.dist-info/RECORD +0 -51
- {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/loops/uitars.py
CHANGED
@@ -4,39 +4,50 @@ Paper: https://arxiv.org/abs/2501.12326
 Code: https://github.com/bytedance/UI-TARS
 """
 
+import ast
 import asyncio
-from ctypes import cast
-import json
 import base64
+import json
 import math
 import re
-import
-from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
+from ctypes import cast
 from io import BytesIO
-from
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+
 import litellm
-from litellm.
-
+from litellm.responses.litellm_completion_transformation.transformation import (
+    LiteLLMCompletionResponsesConfig,
+)
 from litellm.responses.utils import Usage
-from
+from litellm.types.utils import ModelResponse
+from openai.types.responses.response_computer_tool_call_param import (
+    ActionType,
+    ResponseComputerToolCallParam,
+)
 from openai.types.responses.response_input_param import ComputerCallOutput
-from openai.types.responses.response_output_message_param import
-
+from openai.types.responses.response_output_message_param import (
+    ResponseOutputMessageParam,
+)
+from openai.types.responses.response_reasoning_item_param import (
+    ResponseReasoningItemParam,
+    Summary,
+)
+from PIL import Image
 
 from ..decorators import register_agent
-from ..types import Messages, AgentResponse, Tools, AgentCapability
 from ..responses import (
-    make_reasoning_item,
-    make_output_text_item,
     make_click_item,
     make_double_click_item,
     make_drag_item,
+    make_input_image_item,
     make_keypress_item,
+    make_output_text_item,
+    make_reasoning_item,
     make_scroll_item,
     make_type_item,
     make_wait_item,
-    make_input_image_item
 )
+from ..types import AgentCapability, AgentResponse, Messages, Tools
 
 # Constants from reference code
 IMAGE_FACTOR = 28

@@ -94,6 +105,7 @@ click(point='<|box_start|>(x1,y1)<|box_end|>')
 ## User Instruction
 {instruction}"""
 
+
 def round_by_factor(number: float, factor: int) -> int:
     """Returns the closest integer to 'number' that is divisible by 'factor'."""
     return round(number / factor) * factor

@@ -110,7 +122,11 @@ def floor_by_factor(number: float, factor: int) -> int:
 
 
 def smart_resize(
-    height: int,
+    height: int,
+    width: int,
+    factor: int = IMAGE_FACTOR,
+    min_pixels: int = MIN_PIXELS,
+    max_pixels: int = MAX_PIXELS,
 ) -> tuple[int, int]:
     """
     Rescales the image so that the following conditions are met:

@@ -144,14 +160,14 @@ def escape_single_quotes(text):
 def parse_action(action_str):
     """Parse action string into structured format."""
     try:
-        node = ast.parse(action_str, mode=
+        node = ast.parse(action_str, mode="eval")
         if not isinstance(node, ast.Expression):
             raise ValueError("Not an expression")
-
+
         call = node.body
         if not isinstance(call, ast.Call):
             raise ValueError("Not a function call")
-
+
         # Get function name
         if isinstance(call.func, ast.Name):
             func_name = call.func.id

@@ -159,7 +175,7 @@ def parse_action(action_str):
             func_name = call.func.attr
         else:
             func_name = None
-
+
         # Get keyword arguments
         kwargs = {}
         for kw in call.keywords:

@@ -171,12 +187,9 @@ def parse_action(action_str):
             else:
                 value = None
             kwargs[key] = value
-
-        return {
-
-            'args': kwargs
-        }
-
+
+        return {"function": func_name, "args": kwargs}
+
     except Exception as e:
         print(f"Failed to parse action '{action_str}': {e}")
         return None

@@ -185,39 +198,39 @@ def parse_action(action_str):
 def parse_uitars_response(text: str, image_width: int, image_height: int) -> List[Dict[str, Any]]:
     """Parse UITARS model response into structured actions."""
     text = text.strip()
-
+
     # Extract thought
     thought = None
     if text.startswith("Thought:"):
         thought_match = re.search(r"Thought: (.+?)(?=\s*Action:|$)", text, re.DOTALL)
         if thought_match:
             thought = thought_match.group(1).strip()
-
+
     # Extract action
     if "Action:" not in text:
         raise ValueError("No Action found in response")
-
+
     action_str = text.split("Action:")[-1].strip()
 
     # Handle special case for type actions
     if "type(content" in action_str:
+
         def escape_quotes(match):
             return match.group(1)
-
+
         pattern = r"type\(content='(.*?)'\)"
         content = re.sub(pattern, escape_quotes, action_str)
         action_str = escape_single_quotes(content)
         action_str = "type(content='" + action_str + "')"
-
-
+
     # Parse the action
     parsed_action = parse_action(action_str.replace("\n", "\\n").lstrip())
     if parsed_action is None:
         raise ValueError(f"Action can't parse: {action_str}")
-
+
     action_type = parsed_action["function"]
     params = parsed_action["args"]
-
+
     # Process parameters
     action_inputs = {}
     for param_name, param in params.items():
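The hunks above settle the response contract: the model replies with a `Thought:` block followed by a single `Action: <call>` string, which `parse_uitars_response` splits apart before `parse_action` evaluates the call via `ast.parse`. A self-contained illustration of that split, reusing the same regex (the sample response text is invented for the example):

import re

# Sample UI-TARS style response (invented for illustration).
text = (
    "Thought: The submit button is in the lower-right corner.\n"
    "Action: click(start_box='<|box_start|>(512,890)<|box_end|>')"
)

# Same thought-extraction regex as parse_uitars_response above.
thought = None
if text.startswith("Thought:"):
    match = re.search(r"Thought: (.+?)(?=\s*Action:|$)", text, re.DOTALL)
    if match:
        thought = match.group(1).strip()

# Everything after the last "Action:" is the action call string.
action_str = text.split("Action:")[-1].strip()

print(thought)     # The submit button is in the lower-right corner.
print(action_str)  # click(start_box='<|box_start|>(512,890)<|box_end|>')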
@@ -225,7 +238,7 @@ def parse_uitars_response(text: str, image_width: int, image_height: int) -> Lis
             continue
         param = str(param).lstrip()
         action_inputs[param_name.strip()] = param
-
+
         # Handle coordinate parameters
         if "start_box" in param_name or "end_box" in param_name:
             # Parse coordinates like '<|box_start|>(x,y)<|box_end|>' or '(x,y)'

@@ -233,117 +246,130 @@ def parse_uitars_response(text: str, image_width: int, image_height: int) -> Lis
             clean_param = param.replace("<|box_start|>", "").replace("<|box_end|>", "")
             # Then remove parentheses and split
             numbers = clean_param.replace("(", "").replace(")", "").split(",")
-
+
             try:
-                float_numbers = [
-
+                float_numbers = [
+                    float(num.strip()) / 1000 for num in numbers
+                ]  # Normalize to 0-1 range
+
                 if len(float_numbers) == 2:
                     # Single point, duplicate for box format
-                    float_numbers = [
-
+                    float_numbers = [
+                        float_numbers[0],
+                        float_numbers[1],
+                        float_numbers[0],
+                        float_numbers[1],
+                    ]
+
                 action_inputs[param_name.strip()] = str(float_numbers)
             except ValueError as e:
                 # If parsing fails, keep the original parameter value
                 print(f"Warning: Could not parse coordinates '{param}': {e}")
                 action_inputs[param_name.strip()] = param
-
-    return [{
-        "thought": thought,
-        "action_type": action_type,
-        "action_inputs": action_inputs,
-        "text": text
-    }]
 
+    return [
+        {
+            "thought": thought,
+            "action_type": action_type,
+            "action_inputs": action_inputs,
+            "text": text,
+        }
+    ]
 
-
+
+def convert_to_computer_actions(
+    parsed_responses: List[Dict[str, Any]], image_width: int, image_height: int
+) -> List[ResponseComputerToolCallParam | ResponseOutputMessageParam]:
     """Convert parsed UITARS responses to computer actions."""
     computer_actions = []
-
+
     for response in parsed_responses:
         action_type = response.get("action_type")
         action_inputs = response.get("action_inputs", {})
-
+
         if action_type == "finished":
             finished_text = action_inputs.get("content", "Task completed successfully.")
             computer_actions.append(make_output_text_item(finished_text))
             break
-
+
         elif action_type == "wait":
            computer_actions.append(make_wait_item())
-
+
         elif action_type == "call_user":
-            computer_actions.append(
-
+            computer_actions.append(
+                make_output_text_item("I need assistance from the user to proceed with this task.")
+            )
+
         elif action_type in ["click", "left_single"]:
             start_box = action_inputs.get("start_box")
             if start_box:
                 coords = eval(start_box)
                 x = int((coords[0] + coords[2]) / 2 * image_width)
                 y = int((coords[1] + coords[3]) / 2 * image_height)
-
+
                 computer_actions.append(make_click_item(x, y, "left"))
-
-        elif action_type
+
+        elif action_type in ["double_click", "left_double"]:
             start_box = action_inputs.get("start_box")
             if start_box:
                 coords = eval(start_box)
                 x = int((coords[0] + coords[2]) / 2 * image_width)
                 y = int((coords[1] + coords[3]) / 2 * image_height)
-
+
                 computer_actions.append(make_double_click_item(x, y))
-
-        elif action_type
+
+        elif action_type in ["right_click", "right_single"]:
             start_box = action_inputs.get("start_box")
             if start_box:
                 coords = eval(start_box)
                 x = int((coords[0] + coords[2]) / 2 * image_width)
                 y = int((coords[1] + coords[3]) / 2 * image_height)
-
+
                 computer_actions.append(make_click_item(x, y, "right"))
-
+
         elif action_type == "type":
             content = action_inputs.get("content", "")
             computer_actions.append(make_type_item(content))
-
+
         elif action_type == "hotkey":
             key = action_inputs.get("key", "")
             keys = key.split()
             computer_actions.append(make_keypress_item(keys))
-
+
         elif action_type == "press":
             key = action_inputs.get("key", "")
             computer_actions.append(make_keypress_item([key]))
-
+
         elif action_type == "scroll":
             start_box = action_inputs.get("start_box")
             direction = action_inputs.get("direction", "down")
-
+
             if start_box:
                 coords = eval(start_box)
                 x = int((coords[0] + coords[2]) / 2 * image_width)
                 y = int((coords[1] + coords[3]) / 2 * image_height)
             else:
                 x, y = image_width // 2, image_height // 2
-
+
             scroll_y = 5 if "up" in direction.lower() else -5
             computer_actions.append(make_scroll_item(x, y, 0, scroll_y))
-
+
         elif action_type == "drag":
             start_box = action_inputs.get("start_box")
             end_box = action_inputs.get("end_box")
-
+
             if start_box and end_box:
                 start_coords = eval(start_box)
                 end_coords = eval(end_box)
-
+
                 start_x = int((start_coords[0] + start_coords[2]) / 2 * image_width)
                 start_y = int((start_coords[1] + start_coords[3]) / 2 * image_height)
                 end_x = int((end_coords[0] + end_coords[2]) / 2 * image_width)
                 end_y = int((end_coords[1] + end_coords[3]) / 2 * image_height)
-
+
                 path = [{"x": start_x, "y": start_y}, {"x": end_x, "y": end_y}]
                 computer_actions.append(make_drag_item(path))
-
+
     return computer_actions
 
 
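Coordinate handling is the heart of these two hunks: box parameters arrive in UI-TARS's 0-1000 virtual space, `parse_uitars_response` divides them down to the 0-1 range, and `convert_to_computer_actions` maps each box center back to pixel coordinates. A standalone sketch of that round trip (reimplemented here rather than imported from the package, with illustrative screen dimensions):

# Round trip of a UI-TARS coordinate, mirroring the arithmetic in the hunks above.

def normalize_box(raw: str) -> list[float]:
    # Strip the special tokens and parentheses, then normalize 0-1000 -> 0-1.
    clean = raw.replace("<|box_start|>", "").replace("<|box_end|>", "")
    nums = [float(n.strip()) / 1000 for n in clean.replace("(", "").replace(")", "").split(",")]
    if len(nums) == 2:
        # A single point is duplicated into (x1, y1, x2, y2) box form.
        nums = [nums[0], nums[1], nums[0], nums[1]]
    return nums

def box_center_pixels(box: list[float], image_width: int, image_height: int) -> tuple[int, int]:
    # The click target is the box center, scaled to the screenshot's pixel size.
    x = int((box[0] + box[2]) / 2 * image_width)
    y = int((box[1] + box[3]) / 2 * image_height)
    return x, y

box = normalize_box("<|box_start|>(500,300)<|box_end|>")  # -> [0.5, 0.3, 0.5, 0.3]
print(box_center_pixels(box, 1920, 1080))                 # -> (960, 324)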
@@ -354,33 +380,35 @@ def pil_to_base64(image: Image.Image) -> str:
     return base64.b64encode(buffer.getvalue()).decode("utf-8")
 
 
-def process_image_for_uitars(
+def process_image_for_uitars(
+    image_data: str, max_pixels: int = MAX_PIXELS, min_pixels: int = MIN_PIXELS
+) -> tuple[Image.Image, int, int]:
     """Process image for UITARS model input."""
     # Decode base64 image
-    if image_data.startswith(
-        image_data = image_data.split(
-
+    if image_data.startswith("data:image"):
+        image_data = image_data.split(",")[1]
+
     image_bytes = base64.b64decode(image_data)
     image = Image.open(BytesIO(image_bytes))
-
+
     original_width, original_height = image.size
-
+
     # Resize image according to UITARS requirements
     if image.width * image.height > max_pixels:
         resize_factor = math.sqrt(max_pixels / (image.width * image.height))
         width = int(image.width * resize_factor)
         height = int(image.height * resize_factor)
         image = image.resize((width, height))
-
+
     if image.width * image.height < min_pixels:
         resize_factor = math.sqrt(min_pixels / (image.width * image.height))
         width = math.ceil(image.width * resize_factor)
         height = math.ceil(image.height * resize_factor)
         image = image.resize((width, height))
-
+
     if image.mode != "RGB":
         image = image.convert("RGB")
-
+
     return image, original_width, original_height
 
 

@@ -391,7 +419,11 @@ def sanitize_message(msg: Any) -> Any:
     for key, value in msg.items():
         if key == "content" and isinstance(value, list):
             result[key] = [
-
+                (
+                    {k: v for k, v in item.items() if k != "image_url"}
+                    if isinstance(item, dict)
+                    else item
+                )
                 for item in value
             ]
         else:

@@ -406,38 +438,41 @@ def sanitize_message(msg: Any) -> Any:
 def convert_uitars_messages_to_litellm(messages: Messages) -> List[Dict[str, Any]]:
     """
     Convert UITARS internal message format back to LiteLLM format.
-
+
     This function processes reasoning, computer_call, and computer_call_output messages
     and converts them to the appropriate LiteLLM assistant message format.
-
+
     Args:
         messages: List of UITARS internal messages
-
+
     Returns:
         List of LiteLLM formatted messages
     """
     litellm_messages = []
     current_assistant_content = []
-
+
     for message in messages:
         if isinstance(message, dict):
            message_type = message.get("type")
-
+
             if message_type == "reasoning":
                 # Extract reasoning text from summary
                 summary = message.get("summary", [])
                 if summary and isinstance(summary, list):
                     for summary_item in summary:
-                        if
+                        if (
+                            isinstance(summary_item, dict)
+                            and summary_item.get("type") == "summary_text"
+                        ):
                             reasoning_text = summary_item.get("text", "")
                             if reasoning_text:
                                 current_assistant_content.append(f"Thought: {reasoning_text}")
-
+
             elif message_type == "computer_call":
                 # Convert computer action to UITARS action format
                 action = message.get("action", {})
                 action_type = action.get("type")
-
+
                 if action_type == "click":
                     x, y = action.get("x", 0), action.get("y", 0)
                     button = action.get("button", "left")

@@ -447,59 +482,65 @@ def convert_uitars_messages_to_litellm(messages: Messages) -> List[Dict[str, Any
                         action_text = f"Action: right_single(start_box='({x},{y})')"
                     else:
                         action_text = f"Action: click(start_box='({x},{y})')"
-
+
                 elif action_type == "double_click":
                     x, y = action.get("x", 0), action.get("y", 0)
                     action_text = f"Action: left_double(start_box='({x},{y})')"
-
+
                 elif action_type == "drag":
                     start_x, start_y = action.get("start_x", 0), action.get("start_y", 0)
                     end_x, end_y = action.get("end_x", 0), action.get("end_y", 0)
                     action_text = f"Action: drag(start_box='({start_x},{start_y})', end_box='({end_x},{end_y})')"
-
+
                 elif action_type == "key":
                     key = action.get("key", "")
                     action_text = f"Action: hotkey(key='{key}')"
-
+
                 elif action_type == "type":
                     text = action.get("text", "")
                     # Escape single quotes in the text
                     escaped_text = escape_single_quotes(text)
                     action_text = f"Action: type(content='{escaped_text}')"
-
+
                 elif action_type == "scroll":
                     x, y = action.get("x", 0), action.get("y", 0)
                     direction = action.get("direction", "down")
                     action_text = f"Action: scroll(start_box='({x},{y})', direction='{direction}')"
-
+
                 elif action_type == "wait":
                     action_text = "Action: wait()"
-
+
                 else:
                     # Fallback for unknown action types
                     action_text = f"Action: {action_type}({action})"
-
+
                 current_assistant_content.append(action_text)
-
+
             # When we hit a computer_call_output, finalize the current assistant message
             if current_assistant_content:
-                litellm_messages.append(
-
-
-
+                litellm_messages.append(
+                    {
+                        "role": "assistant",
+                        "content": [
+                            {"type": "text", "text": "\n".join(current_assistant_content)}
+                        ],
+                    }
+                )
                 current_assistant_content = []
-
+
             elif message_type == "computer_call_output":
                 # Add screenshot from computer call output
                 output = message.get("output", {})
                 if isinstance(output, dict) and output.get("type") == "input_image":
                     image_url = output.get("image_url", "")
                     if image_url:
-                        litellm_messages.append(
-
-
-
-
+                        litellm_messages.append(
+                            {
+                                "role": "user",
+                                "content": [{"type": "image_url", "image_url": {"url": image_url}}],
+                            }
+                        )
+
             elif message.get("role") == "user":
                 # # Handle user messages
                 # content = message.get("content", "")

@@ -514,24 +555,27 @@ def convert_uitars_messages_to_litellm(messages: Messages) -> List[Dict[str, Any
                 #     "content": content
                 # })
                 pass
-
+
     # Add any remaining assistant content
     if current_assistant_content:
-        litellm_messages.append(
-
-
-
-
+        litellm_messages.append(
+            {
+                "role": "assistant",
+                "content": [{"type": "text", "text": "\n".join(current_assistant_content)}],
+            }
+        )
+
     return litellm_messages
 
-
+
+@register_agent(models=r"(?i).*ui-?tars.*", priority=-1)
 class UITARSConfig:
     """
     UITARS agent configuration using liteLLM for ByteDance-Seed/UI-TARS-1.5-7B model.
-
+
     Supports UITARS vision-language models for computer control.
     """
-
+
     async def predict_step(
         self,
         messages: List[Dict[str, Any]],

@@ -545,11 +589,11 @@ class UITARSConfig:
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         """
         Predict the next step based on input messages.
-
+
         Args:
             messages: Input messages following Responses format
             model: Model name to use

@@ -562,22 +606,22 @@ class UITARSConfig:
             _on_usage: Callback for usage tracking
             _on_screenshot: Callback for screenshot events
             **kwargs: Additional arguments
-
+
         Returns:
             Dictionary with "output" (output items) and "usage" array
         """
         tools = tools or []
-
+
         # Create response items
         response_items = []
-
+
         # Find computer tool for screen dimensions
         computer_tool = None
         for tool_schema in tools:
             if tool_schema["type"] == "computer":
                 computer_tool = tool_schema["computer"]
                 break
-
+
         # Get screen dimensions
         screen_width, screen_height = 1024, 768
         if computer_tool:

@@ -585,20 +629,20 @@ class UITARSConfig:
                 screen_width, screen_height = await computer_tool.get_dimensions()
             except:
                 pass
-
+
         # Process messages to extract instruction and image
         instruction = ""
         image_data = None
-
+
         # Convert messages to list if string
         if isinstance(messages, str):
             messages = [{"role": "user", "content": messages}]
-
+
         # Extract instruction and latest screenshot
         for message in reversed(messages):
             if isinstance(message, dict):
                 content = message.get("content", "")
-
+
                 # Handle different content formats
                 if isinstance(content, str):
                     if not instruction and message.get("role") == "user":

@@ -614,46 +658,41 @@ class UITARSConfig:
                             image_data = image_url.get("url", "")
                         else:
                             image_data = image_url
-
+
             # Also check for computer_call_output with screenshots
             if message.get("type") == "computer_call_output" and not image_data:
                 output = message.get("output", {})
                 if isinstance(output, dict) and output.get("type") == "input_image":
                     image_data = output.get("image_url", "")
-
+
             if instruction and image_data:
                 break
-
+
         if not instruction:
-            instruction =
-
+            instruction = (
+                "Help me complete this task by analyzing the screen and taking appropriate actions."
+            )
+
         # Create prompt
         user_prompt = UITARS_PROMPT_TEMPLATE.format(
-            instruction=instruction,
-            action_space=UITARS_ACTION_SPACE,
-            language="English"
+            instruction=instruction, action_space=UITARS_ACTION_SPACE, language="English"
         )
-
+
         # Convert conversation history to LiteLLM format
         history_messages = convert_uitars_messages_to_litellm(messages)
-
+
         # Prepare messages for liteLLM
-        litellm_messages = [
-            {
-                "role": "system",
-                "content": "You are a helpful assistant."
-            }
-        ]
+        litellm_messages = [{"role": "system", "content": "You are a helpful assistant."}]
 
         # Add current user instruction with screenshot
         current_user_message = {
-            "role": "user",
+            "role": "user",
             "content": [
                 {"type": "text", "text": user_prompt},
-            ]
+            ],
         }
         litellm_messages.append(current_user_message)
-
+
         # Process image for UITARS
         if not image_data:
             # Take screenshot if none found in messages

@@ -667,17 +706,22 @@
             raise ValueError("No screenshot found in messages and no computer_handler provided")
         processed_image, original_width, original_height = process_image_for_uitars(image_data)
         encoded_image = pil_to_base64(processed_image)
-
+
         # Add conversation history
         if history_messages:
             litellm_messages.extend(history_messages)
         else:
-            litellm_messages.append(
-
-
-
-
-
+            litellm_messages.append(
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/png;base64,{encoded_image}"},
+                        }
+                    ],
+                }
+            )
 
         # Prepare API call kwargs
         api_kwargs = {

@@ -687,146 +731,143 @@ class UITARSConfig:
             "temperature": kwargs.get("temperature", 0.0),
             "do_sample": kwargs.get("temperature", 0.0) > 0.0,
             "num_retries": max_retries,
-            **{k: v for k, v in kwargs.items() if k not in ["max_tokens", "temperature"]}
+            **{k: v for k, v in kwargs.items() if k not in ["max_tokens", "temperature"]},
         }
-
+
         # Call API start hook
         if _on_api_start:
             await _on_api_start(api_kwargs)
-
+
         # Call liteLLM with UITARS model
         response = await litellm.acompletion(**api_kwargs)
-
+
         # Call API end hook
         if _on_api_end:
             await _on_api_end(api_kwargs, response)
-
+
         # Extract response content
-        response_content = response.choices[0].message.content.strip()
-
+        response_content = response.choices[0].message.content.strip()  # type: ignore
+
         # Parse UITARS response
         parsed_responses = parse_uitars_response(response_content, original_width, original_height)
-
+
         # Convert to computer actions
-        computer_actions = convert_to_computer_actions(
-
+        computer_actions = convert_to_computer_actions(
+            parsed_responses, original_width, original_height
+        )
+
         # Add computer actions to response items
         thought = parsed_responses[0].get("thought", "")
         if thought:
             response_items.append(make_reasoning_item(thought))
         response_items.extend(computer_actions)
-
+
         # Extract usage information
         response_usage = {
-            **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(
+            **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(
+                response.usage
+            ).model_dump(),
             "response_cost": response._hidden_params.get("response_cost", 0.0),
         }
         if _on_usage:
             await _on_usage(response_usage)
 
         # Create agent response
-        agent_response = {
-
-            "usage": response_usage
-        }
-
+        agent_response = {"output": response_items, "usage": response_usage}
+
         return agent_response
-
+
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str
+        self, model: str, image_b64: str, instruction: str, **kwargs
     ) -> Optional[Tuple[int, int]]:
         """
         Predict click coordinates based on image and instruction.
-
+
         UITARS supports click prediction through its action parsing.
-
+
         Args:
             model: Model name to use
             image_b64: Base64 encoded image
             instruction: Instruction for where to click
-
+
         Returns:
             Tuple with (x, y) coordinates or None
         """
         try:
             # Create prompt using grounding template
-            user_prompt = GROUNDING_UITARS_PROMPT_TEMPLATE.format(
-
-            )
-
+            user_prompt = GROUNDING_UITARS_PROMPT_TEMPLATE.format(instruction=instruction)
+
             # Process image for UITARS
             processed_image, original_width, original_height = process_image_for_uitars(image_b64)
             encoded_image = pil_to_base64(processed_image)
-
+
             # Prepare messages for liteLLM
             litellm_messages = [
-                {
-                    "role": "system",
-                    "content": "You are a helpful assistant."
-                },
+                {"role": "system", "content": "You are a helpful assistant."},
                 {
                     "role": "user",
                     "content": [
                         {"type": "text", "text": user_prompt},
-                        {
-
-
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/png;base64,{encoded_image}"},
+                        },
+                    ],
+                },
             ]
-
+
             # Prepare API call kwargs
             api_kwargs = {
                 "model": model,
                 "messages": litellm_messages,
-                "max_tokens":
+                "max_tokens": 2056,
                 "temperature": 0.0,
-                "do_sample": False
+                "do_sample": False,
             }
-
+            api_kwargs.update({k: v for k, v in (kwargs or {}).items()})
+
             # Call liteLLM with UITARS model
             response = await litellm.acompletion(**api_kwargs)
-
+
             # Extract response content
-            response_content = response.choices[0].message.content.strip()
-
+            response_content = response.choices[0].message.content.strip()  # type: ignore
+
             print(response_content)
 
             # Parse the response to extract click coordinates
             # Look for click action with coordinates (with special tokens)
             click_pattern = r"click\(point='<\|box_start\|>\((\d+),(\d+)\)<\|box_end\|>'\)"
             match = re.search(click_pattern, response_content)
-
+
             # Fallback: Look for simpler format without special tokens
             if not match:
                 # Pattern for: click(start_box='(x,y)') or click(point='(x,y)')
                 fallback_pattern = r"click\((?:start_box|point)='\((\d+),(\d+)\)'\)"
                 match = re.search(fallback_pattern, response_content)
-
+
             if match:
                 x, y = int(match.group(1)), int(match.group(2))
                 # Scale coordinates back to original image dimensions
                 scale_x = original_width / processed_image.width
                 scale_y = original_height / processed_image.height
-
+
                 scaled_x = int(x * scale_x)
                 scaled_y = int(y * scale_y)
-
+
                 return (scaled_x, scaled_y)
-
+
             return None
-
+
         except Exception as e:
             # Log error and return None
             print(f"Error in predict_click: {e}")
             return None
-
+
     def get_capabilities(self) -> List[AgentCapability]:
         """
         Get list of capabilities supported by this agent config.
-
+
         Returns:
             List of capability strings
         """
-        return ["step", "click"]
+        return ["step", "click"]