cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
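
To reproduce a comparison like this locally, both wheels can be fetched (for example with pip download cua-agent==0.4.14 --no-deps and pip download cua-agent==0.7.16 --no-deps) and diffed member by member. A minimal sketch using only the Python standard library; the wheel filenames are assumptions about what the download produces:

    import difflib
    import zipfile

    OLD = "cua_agent-0.4.14-py3-none-any.whl"  # assumed local filenames
    NEW = "cua_agent-0.7.16-py3-none-any.whl"

    with zipfile.ZipFile(OLD) as old, zipfile.ZipFile(NEW) as new:
        old_names, new_names = set(old.namelist()), set(new.namelist())
        for name in sorted(old_names | new_names):
            a = old.read(name).decode("utf-8", "replace").splitlines() if name in old_names else []
            b = new.read(name).decode("utf-8", "replace").splitlines() if name in new_names else []
            for line in difflib.unified_diff(a, b, fromfile=f"old/{name}", tofile=f"new/{name}", lineterm=""):
                print(line)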

Potentially problematic release.



Files changed (82)
  1. agent/__init__.py +4 -19
  2. agent/__main__.py +2 -1
  3. agent/adapters/__init__.py +6 -0
  4. agent/adapters/azure_ml_adapter.py +283 -0
  5. agent/adapters/cua_adapter.py +161 -0
  6. agent/adapters/huggingfacelocal_adapter.py +67 -125
  7. agent/adapters/human_adapter.py +116 -114
  8. agent/adapters/mlxvlm_adapter.py +370 -0
  9. agent/adapters/models/__init__.py +41 -0
  10. agent/adapters/models/generic.py +78 -0
  11. agent/adapters/models/internvl.py +290 -0
  12. agent/adapters/models/opencua.py +115 -0
  13. agent/adapters/models/qwen2_5_vl.py +78 -0
  14. agent/agent.py +431 -241
  15. agent/callbacks/__init__.py +10 -3
  16. agent/callbacks/base.py +45 -31
  17. agent/callbacks/budget_manager.py +22 -10
  18. agent/callbacks/image_retention.py +54 -98
  19. agent/callbacks/logging.py +55 -42
  20. agent/callbacks/operator_validator.py +140 -0
  21. agent/callbacks/otel.py +291 -0
  22. agent/callbacks/pii_anonymization.py +19 -16
  23. agent/callbacks/prompt_instructions.py +47 -0
  24. agent/callbacks/telemetry.py +106 -69
  25. agent/callbacks/trajectory_saver.py +178 -70
  26. agent/cli.py +269 -119
  27. agent/computers/__init__.py +14 -9
  28. agent/computers/base.py +32 -19
  29. agent/computers/cua.py +52 -25
  30. agent/computers/custom.py +78 -71
  31. agent/decorators.py +23 -14
  32. agent/human_tool/__init__.py +2 -7
  33. agent/human_tool/__main__.py +6 -2
  34. agent/human_tool/server.py +48 -37
  35. agent/human_tool/ui.py +359 -235
  36. agent/integrations/hud/__init__.py +164 -74
  37. agent/integrations/hud/agent.py +338 -342
  38. agent/integrations/hud/proxy.py +297 -0
  39. agent/loops/__init__.py +44 -14
  40. agent/loops/anthropic.py +590 -492
  41. agent/loops/base.py +19 -15
  42. agent/loops/composed_grounded.py +142 -144
  43. agent/loops/fara/__init__.py +8 -0
  44. agent/loops/fara/config.py +506 -0
  45. agent/loops/fara/helpers.py +357 -0
  46. agent/loops/fara/schema.py +143 -0
  47. agent/loops/gelato.py +183 -0
  48. agent/loops/gemini.py +935 -0
  49. agent/loops/generic_vlm.py +601 -0
  50. agent/loops/glm45v.py +140 -135
  51. agent/loops/gta1.py +48 -51
  52. agent/loops/holo.py +218 -0
  53. agent/loops/internvl.py +180 -0
  54. agent/loops/moondream3.py +493 -0
  55. agent/loops/omniparser.py +326 -226
  56. agent/loops/openai.py +63 -56
  57. agent/loops/opencua.py +134 -0
  58. agent/loops/uiins.py +175 -0
  59. agent/loops/uitars.py +262 -212
  60. agent/loops/uitars2.py +951 -0
  61. agent/playground/__init__.py +5 -0
  62. agent/playground/server.py +301 -0
  63. agent/proxy/examples.py +196 -0
  64. agent/proxy/handlers.py +255 -0
  65. agent/responses.py +486 -339
  66. agent/tools/__init__.py +24 -0
  67. agent/tools/base.py +253 -0
  68. agent/tools/browser_tool.py +423 -0
  69. agent/types.py +20 -5
  70. agent/ui/__init__.py +1 -1
  71. agent/ui/__main__.py +1 -1
  72. agent/ui/gradio/app.py +25 -22
  73. agent/ui/gradio/ui_components.py +314 -167
  74. cua_agent-0.7.16.dist-info/METADATA +85 -0
  75. cua_agent-0.7.16.dist-info/RECORD +79 -0
  76. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
  77. agent/integrations/hud/adapter.py +0 -121
  78. agent/integrations/hud/computer_handler.py +0 -187
  79. agent/telemetry.py +0 -142
  80. cua_agent-0.4.14.dist-info/METADATA +0 -436
  81. cua_agent-0.4.14.dist-info/RECORD +0 -50
  82. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/loops/uitars.py CHANGED
@@ -4,39 +4,50 @@ Paper: https://arxiv.org/abs/2501.12326
  Code: https://github.com/bytedance/UI-TARS
  """

+ import ast
  import asyncio
- from ctypes import cast
- import json
  import base64
+ import json
  import math
  import re
- import ast
- from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
+ from ctypes import cast
  from io import BytesIO
- from PIL import Image
+ from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+
  import litellm
- from litellm.types.utils import ModelResponse
- from litellm.responses.litellm_completion_transformation.transformation import LiteLLMCompletionResponsesConfig
+ from litellm.responses.litellm_completion_transformation.transformation import (
+     LiteLLMCompletionResponsesConfig,
+ )
  from litellm.responses.utils import Usage
- from openai.types.responses.response_computer_tool_call_param import ActionType, ResponseComputerToolCallParam
+ from litellm.types.utils import ModelResponse
+ from openai.types.responses.response_computer_tool_call_param import (
+     ActionType,
+     ResponseComputerToolCallParam,
+ )
  from openai.types.responses.response_input_param import ComputerCallOutput
- from openai.types.responses.response_output_message_param import ResponseOutputMessageParam
- from openai.types.responses.response_reasoning_item_param import ResponseReasoningItemParam, Summary
+ from openai.types.responses.response_output_message_param import (
+     ResponseOutputMessageParam,
+ )
+ from openai.types.responses.response_reasoning_item_param import (
+     ResponseReasoningItemParam,
+     Summary,
+ )
+ from PIL import Image

  from ..decorators import register_agent
- from ..types import Messages, AgentResponse, Tools, AgentCapability
  from ..responses import (
-     make_reasoning_item,
-     make_output_text_item,
      make_click_item,
      make_double_click_item,
      make_drag_item,
+     make_input_image_item,
      make_keypress_item,
+     make_output_text_item,
+     make_reasoning_item,
      make_scroll_item,
      make_type_item,
      make_wait_item,
-     make_input_image_item
  )
+ from ..types import AgentCapability, AgentResponse, Messages, Tools

  # Constants from reference code
  IMAGE_FACTOR = 28
@@ -94,6 +105,7 @@ click(point='<|box_start|>(x1,y1)<|box_end|>')
  ## User Instruction
  {instruction}"""

+
  def round_by_factor(number: float, factor: int) -> int:
      """Returns the closest integer to 'number' that is divisible by 'factor'."""
      return round(number / factor) * factor
@@ -110,7 +122,11 @@ def floor_by_factor(number: float, factor: int) -> int:


  def smart_resize(
-     height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
+     height: int,
+     width: int,
+     factor: int = IMAGE_FACTOR,
+     min_pixels: int = MIN_PIXELS,
+     max_pixels: int = MAX_PIXELS,
  ) -> tuple[int, int]:
      """
      Rescales the image so that the following conditions are met:
@@ -144,14 +160,14 @@ def escape_single_quotes(text):
  def parse_action(action_str):
      """Parse action string into structured format."""
      try:
-         node = ast.parse(action_str, mode='eval')
+         node = ast.parse(action_str, mode="eval")
          if not isinstance(node, ast.Expression):
              raise ValueError("Not an expression")
-
+
          call = node.body
          if not isinstance(call, ast.Call):
              raise ValueError("Not a function call")
-
+
          # Get function name
          if isinstance(call.func, ast.Name):
              func_name = call.func.id
@@ -159,7 +175,7 @@ def parse_action(action_str):
              func_name = call.func.attr
          else:
              func_name = None
-
+
          # Get keyword arguments
          kwargs = {}
          for kw in call.keywords:
@@ -171,12 +187,9 @@ def parse_action(action_str):
              else:
                  value = None
              kwargs[key] = value
-
-         return {
-             'function': func_name,
-             'args': kwargs
-         }
-
+
+         return {"function": func_name, "args": kwargs}
+
      except Exception as e:
          print(f"Failed to parse action '{action_str}': {e}")
          return None
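
For reference, the reworked parse_action keeps its original behavior: keyword arguments are read off the AST, so a call string round-trips like this (the sample strings are illustrative):

    parse_action("click(start_box='(100,200)')")
    # -> {"function": "click", "args": {"start_box": "(100,200)"}}

    parse_action("not a valid call")
    # prints the parse failure and returns None
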
@@ -185,39 +198,39 @@ def parse_action(action_str):
      """Parse UITARS model response into structured actions."""
      text = text.strip()
-
+
      # Extract thought
      thought = None
      if text.startswith("Thought:"):
          thought_match = re.search(r"Thought: (.+?)(?=\s*Action:|$)", text, re.DOTALL)
          if thought_match:
              thought = thought_match.group(1).strip()
-
+
      # Extract action
      if "Action:" not in text:
          raise ValueError("No Action found in response")
-
+
      action_str = text.split("Action:")[-1].strip()

      # Handle special case for type actions
      if "type(content" in action_str:
+
          def escape_quotes(match):
              return match.group(1)
-
+
          pattern = r"type\(content='(.*?)'\)"
          content = re.sub(pattern, escape_quotes, action_str)
          action_str = escape_single_quotes(content)
          action_str = "type(content='" + action_str + "')"
-
-
+
      # Parse the action
      parsed_action = parse_action(action_str.replace("\n", "\\n").lstrip())
      if parsed_action is None:
          raise ValueError(f"Action can't parse: {action_str}")
-
+
      action_type = parsed_action["function"]
      params = parsed_action["args"]
-
+
      # Process parameters
      action_inputs = {}
      for param_name, param in params.items():
@@ -225,116 +238,138 @@ def parse_uitars_response(text: str, image_width: int, image_height: int) -> Lis
              continue
          param = str(param).lstrip()
          action_inputs[param_name.strip()] = param
-
+
          # Handle coordinate parameters
          if "start_box" in param_name or "end_box" in param_name:
-             # Parse coordinates like '(x,y)' or '(x1,y1,x2,y2)'
-             numbers = param.replace("(", "").replace(")", "").split(",")
-             float_numbers = [float(num.strip()) / 1000 for num in numbers]  # Normalize to 0-1 range
-
-             if len(float_numbers) == 2:
-                 # Single point, duplicate for box format
-                 float_numbers = [float_numbers[0], float_numbers[1], float_numbers[0], float_numbers[1]]
-
-             action_inputs[param_name.strip()] = str(float_numbers)
-
-     return [{
-         "thought": thought,
-         "action_type": action_type,
-         "action_inputs": action_inputs,
-         "text": text
-     }]
-
-
- def convert_to_computer_actions(parsed_responses: List[Dict[str, Any]], image_width: int, image_height: int) -> List[ResponseComputerToolCallParam | ResponseOutputMessageParam]:
+             # Parse coordinates like '<|box_start|>(x,y)<|box_end|>' or '(x,y)'
+             # First, remove special tokens
+             clean_param = param.replace("<|box_start|>", "").replace("<|box_end|>", "")
+             # Then remove parentheses and split
+             numbers = clean_param.replace("(", "").replace(")", "").split(",")
+
+             try:
+                 float_numbers = [
+                     float(num.strip()) / 1000 for num in numbers
+                 ]  # Normalize to 0-1 range
+
+                 if len(float_numbers) == 2:
+                     # Single point, duplicate for box format
+                     float_numbers = [
+                         float_numbers[0],
+                         float_numbers[1],
+                         float_numbers[0],
+                         float_numbers[1],
+                     ]
+
+                 action_inputs[param_name.strip()] = str(float_numbers)
+             except ValueError as e:
+                 # If parsing fails, keep the original parameter value
+                 print(f"Warning: Could not parse coordinates '{param}': {e}")
+                 action_inputs[param_name.strip()] = param
+
+     return [
+         {
+             "thought": thought,
+             "action_type": action_type,
+             "action_inputs": action_inputs,
+             "text": text,
+         }
+     ]
+
+
+ def convert_to_computer_actions(
+     parsed_responses: List[Dict[str, Any]], image_width: int, image_height: int
+ ) -> List[ResponseComputerToolCallParam | ResponseOutputMessageParam]:
      """Convert parsed UITARS responses to computer actions."""
      computer_actions = []
-
+
      for response in parsed_responses:
          action_type = response.get("action_type")
          action_inputs = response.get("action_inputs", {})
-
+
          if action_type == "finished":
              finished_text = action_inputs.get("content", "Task completed successfully.")
              computer_actions.append(make_output_text_item(finished_text))
              break
-
+
          elif action_type == "wait":
              computer_actions.append(make_wait_item())
-
+
          elif action_type == "call_user":
-             computer_actions.append(make_output_text_item("I need assistance from the user to proceed with this task."))
-
+             computer_actions.append(
+                 make_output_text_item("I need assistance from the user to proceed with this task.")
+             )
+
          elif action_type in ["click", "left_single"]:
              start_box = action_inputs.get("start_box")
              if start_box:
                  coords = eval(start_box)
                  x = int((coords[0] + coords[2]) / 2 * image_width)
                  y = int((coords[1] + coords[3]) / 2 * image_height)
-
+
                  computer_actions.append(make_click_item(x, y, "left"))
-
-         elif action_type == "double_click":
+
+         elif action_type in ["double_click", "left_double"]:
              start_box = action_inputs.get("start_box")
              if start_box:
                  coords = eval(start_box)
                  x = int((coords[0] + coords[2]) / 2 * image_width)
                  y = int((coords[1] + coords[3]) / 2 * image_height)
-
+
                  computer_actions.append(make_double_click_item(x, y))
-
-         elif action_type == "right_click":
+
+         elif action_type in ["right_click", "right_single"]:
              start_box = action_inputs.get("start_box")
              if start_box:
                  coords = eval(start_box)
                  x = int((coords[0] + coords[2]) / 2 * image_width)
                  y = int((coords[1] + coords[3]) / 2 * image_height)
-
+
                  computer_actions.append(make_click_item(x, y, "right"))
-
+
          elif action_type == "type":
              content = action_inputs.get("content", "")
              computer_actions.append(make_type_item(content))
-
+
          elif action_type == "hotkey":
              key = action_inputs.get("key", "")
              keys = key.split()
              computer_actions.append(make_keypress_item(keys))
-
+
          elif action_type == "press":
              key = action_inputs.get("key", "")
              computer_actions.append(make_keypress_item([key]))
-
+
          elif action_type == "scroll":
              start_box = action_inputs.get("start_box")
              direction = action_inputs.get("direction", "down")
-
+
              if start_box:
                  coords = eval(start_box)
                  x = int((coords[0] + coords[2]) / 2 * image_width)
                  y = int((coords[1] + coords[3]) / 2 * image_height)
              else:
                  x, y = image_width // 2, image_height // 2
-
+
              scroll_y = 5 if "up" in direction.lower() else -5
              computer_actions.append(make_scroll_item(x, y, 0, scroll_y))
-
+
          elif action_type == "drag":
              start_box = action_inputs.get("start_box")
              end_box = action_inputs.get("end_box")
-
+
              if start_box and end_box:
                  start_coords = eval(start_box)
                  end_coords = eval(end_box)
-
+
                  start_x = int((start_coords[0] + start_coords[2]) / 2 * image_width)
                  start_y = int((start_coords[1] + start_coords[3]) / 2 * image_height)
                  end_x = int((end_coords[0] + end_coords[2]) / 2 * image_width)
                  end_y = int((end_coords[1] + end_coords[3]) / 2 * image_height)
-
+
                  path = [{"x": start_x, "y": start_y}, {"x": end_x, "y": end_y}]
                  computer_actions.append(make_drag_item(path))
-
+
      return computer_actions
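
The coordinate handling added above maps model output from a 0-1000 grid to the 0-1 range and then takes the box centre in screen pixels. A small worked example mirroring that logic (screen size and model output are made up):

    param = "<|box_start|>(500,300)<|box_end|>"                      # e.g. from click(start_box='...')
    clean = param.replace("<|box_start|>", "").replace("<|box_end|>", "")
    nums = [float(n) / 1000 for n in clean.strip("()").split(",")]   # [0.5, 0.3]
    box = nums + nums                                                # a single point becomes a degenerate box
    x = int((box[0] + box[2]) / 2 * 1024)                            # 512 on a 1024x768 screenshot
    y = int((box[1] + box[3]) / 2 * 768)                             # 230
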
@@ -345,33 +380,35 @@ def pil_to_base64(image: Image.Image) -> str:
      return base64.b64encode(buffer.getvalue()).decode("utf-8")


- def process_image_for_uitars(image_data: str, max_pixels: int = MAX_PIXELS, min_pixels: int = MIN_PIXELS) -> tuple[Image.Image, int, int]:
+ def process_image_for_uitars(
+     image_data: str, max_pixels: int = MAX_PIXELS, min_pixels: int = MIN_PIXELS
+ ) -> tuple[Image.Image, int, int]:
      """Process image for UITARS model input."""
      # Decode base64 image
-     if image_data.startswith('data:image'):
-         image_data = image_data.split(',')[1]
-
+     if image_data.startswith("data:image"):
+         image_data = image_data.split(",")[1]
+
      image_bytes = base64.b64decode(image_data)
      image = Image.open(BytesIO(image_bytes))
-
+
      original_width, original_height = image.size
-
+
      # Resize image according to UITARS requirements
      if image.width * image.height > max_pixels:
          resize_factor = math.sqrt(max_pixels / (image.width * image.height))
          width = int(image.width * resize_factor)
          height = int(image.height * resize_factor)
          image = image.resize((width, height))
-
+
      if image.width * image.height < min_pixels:
          resize_factor = math.sqrt(min_pixels / (image.width * image.height))
          width = math.ceil(image.width * resize_factor)
          height = math.ceil(image.height * resize_factor)
          image = image.resize((width, height))
-
+
      if image.mode != "RGB":
          image = image.convert("RGB")
-
+
      return image, original_width, original_height
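
The resizing above scales both sides by sqrt(target_area / current_area) so the pixel count lands inside [min_pixels, max_pixels]. A standalone sketch of the same arithmetic; the pixel limits below are arbitrary example values, not the module's MIN_PIXELS/MAX_PIXELS constants:

    import math

    def fit_area(width: int, height: int, max_pixels: int, min_pixels: int) -> tuple[int, int]:
        # Same shrink/grow rule as process_image_for_uitars above.
        area = width * height
        if area > max_pixels:
            f = math.sqrt(max_pixels / area)
            width, height = int(width * f), int(height * f)
        elif area < min_pixels:
            f = math.sqrt(min_pixels / area)
            width, height = math.ceil(width * f), math.ceil(height * f)
        return width, height

    # e.g. fit_area(2560, 1440, max_pixels=1_000_000, min_pixels=3136) shrinks the
    # screenshot to roughly 1333x750 while keeping the aspect ratio.
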
@@ -382,7 +419,11 @@ def sanitize_message(msg: Any) -> Any:
          for key, value in msg.items():
              if key == "content" and isinstance(value, list):
                  result[key] = [
-                     {k: v for k, v in item.items() if k != "image_url"} if isinstance(item, dict) else item
+                     (
+                         {k: v for k, v in item.items() if k != "image_url"}
+                         if isinstance(item, dict)
+                         else item
+                     )
                      for item in value
                  ]
              else:
@@ -397,38 +438,41 @@ def sanitize_message(msg: Any) -> Any:
  def convert_uitars_messages_to_litellm(messages: Messages) -> List[Dict[str, Any]]:
      """
      Convert UITARS internal message format back to LiteLLM format.
-
+
      This function processes reasoning, computer_call, and computer_call_output messages
      and converts them to the appropriate LiteLLM assistant message format.
-
+
      Args:
          messages: List of UITARS internal messages
-
+
      Returns:
          List of LiteLLM formatted messages
      """
      litellm_messages = []
      current_assistant_content = []
-
+
      for message in messages:
          if isinstance(message, dict):
              message_type = message.get("type")
-
+
              if message_type == "reasoning":
                  # Extract reasoning text from summary
                  summary = message.get("summary", [])
                  if summary and isinstance(summary, list):
                      for summary_item in summary:
-                         if isinstance(summary_item, dict) and summary_item.get("type") == "summary_text":
+                         if (
+                             isinstance(summary_item, dict)
+                             and summary_item.get("type") == "summary_text"
+                         ):
                              reasoning_text = summary_item.get("text", "")
                              if reasoning_text:
                                  current_assistant_content.append(f"Thought: {reasoning_text}")
-
+
              elif message_type == "computer_call":
                  # Convert computer action to UITARS action format
                  action = message.get("action", {})
                  action_type = action.get("type")
-
+
                  if action_type == "click":
                      x, y = action.get("x", 0), action.get("y", 0)
                      button = action.get("button", "left")
@@ -438,59 +482,65 @@ def convert_uitars_messages_to_litellm(messages: Messages) -> List[Dict[str, Any
                          action_text = f"Action: right_single(start_box='({x},{y})')"
                      else:
                          action_text = f"Action: click(start_box='({x},{y})')"
-
+
                  elif action_type == "double_click":
                      x, y = action.get("x", 0), action.get("y", 0)
                      action_text = f"Action: left_double(start_box='({x},{y})')"
-
+
                  elif action_type == "drag":
                      start_x, start_y = action.get("start_x", 0), action.get("start_y", 0)
                      end_x, end_y = action.get("end_x", 0), action.get("end_y", 0)
                      action_text = f"Action: drag(start_box='({start_x},{start_y})', end_box='({end_x},{end_y})')"
-
+
                  elif action_type == "key":
                      key = action.get("key", "")
                      action_text = f"Action: hotkey(key='{key}')"
-
+
                  elif action_type == "type":
                      text = action.get("text", "")
                      # Escape single quotes in the text
                      escaped_text = escape_single_quotes(text)
                      action_text = f"Action: type(content='{escaped_text}')"
-
+
                  elif action_type == "scroll":
                      x, y = action.get("x", 0), action.get("y", 0)
                      direction = action.get("direction", "down")
                      action_text = f"Action: scroll(start_box='({x},{y})', direction='{direction}')"
-
+
                  elif action_type == "wait":
                      action_text = "Action: wait()"
-
+
                  else:
                      # Fallback for unknown action types
                      action_text = f"Action: {action_type}({action})"
-
+
                  current_assistant_content.append(action_text)
-
+
                  # When we hit a computer_call_output, finalize the current assistant message
                  if current_assistant_content:
-                     litellm_messages.append({
-                         "role": "assistant",
-                         "content": [{"type": "text", "text": "\n".join(current_assistant_content)}]
-                     })
+                     litellm_messages.append(
+                         {
+                             "role": "assistant",
+                             "content": [
+                                 {"type": "text", "text": "\n".join(current_assistant_content)}
+                             ],
+                         }
+                     )
                      current_assistant_content = []
-
+
              elif message_type == "computer_call_output":
                  # Add screenshot from computer call output
                  output = message.get("output", {})
                  if isinstance(output, dict) and output.get("type") == "input_image":
                      image_url = output.get("image_url", "")
                      if image_url:
-                         litellm_messages.append({
-                             "role": "user",
-                             "content": [{"type": "image_url", "image_url": {"url": image_url}}]
-                         })
-
+                         litellm_messages.append(
+                             {
+                                 "role": "user",
+                                 "content": [{"type": "image_url", "image_url": {"url": image_url}}],
+                             }
+                         )
+
              elif message.get("role") == "user":
                  # # Handle user messages
                  # content = message.get("content", "")
@@ -505,24 +555,27 @@ def convert_uitars_messages_to_litellm(messages: Messages) -> List[Dict[str, Any
                  # "content": content
                  # })
                  pass
-
+
      # Add any remaining assistant content
      if current_assistant_content:
-         litellm_messages.append({
-             "role": "assistant",
-             "content": current_assistant_content
-         })
-
+         litellm_messages.append(
+             {
+                 "role": "assistant",
+                 "content": [{"type": "text", "text": "\n".join(current_assistant_content)}],
+             }
+         )
+
      return litellm_messages
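
Concretely, the converter folds a reasoning item plus the computer_call that follows it into one assistant turn, and turns each computer_call_output screenshot into a user image message. A small illustrative input/output pair (all values made up):

    history = [
        {"type": "reasoning",
         "summary": [{"type": "summary_text", "text": "The Submit button is at the bottom."}]},
        {"type": "computer_call",
         "action": {"type": "click", "x": 512, "y": 700, "button": "left"}},
        {"type": "computer_call_output",
         "output": {"type": "input_image", "image_url": "data:image/png;base64,..."}},
    ]
    convert_uitars_messages_to_litellm(history)
    # -> [{"role": "assistant",
    #      "content": [{"type": "text",
    #                   "text": "Thought: The Submit button is at the bottom.\nAction: click(start_box='(512,700)')"}]},
    #     {"role": "user",
    #      "content": [{"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}]}]
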

- @register_agent(models=r"(?i).*ui-?tars.*")
+
+ @register_agent(models=r"(?i).*ui-?tars.*", priority=-1)
  class UITARSConfig:
      """
      UITARS agent configuration using liteLLM for ByteDance-Seed/UI-TARS-1.5-7B model.
-
+
      Supports UITARS vision-language models for computer control.
      """
-
+
      async def predict_step(
          self,
          messages: List[Dict[str, Any]],
@@ -536,11 +589,11 @@ class UITARSConfig:
          _on_api_end=None,
          _on_usage=None,
          _on_screenshot=None,
-         **kwargs
+         **kwargs,
      ) -> Dict[str, Any]:
          """
          Predict the next step based on input messages.
-
+
          Args:
              messages: Input messages following Responses format
              model: Model name to use
@@ -553,22 +606,22 @@ class UITARSConfig:
              _on_usage: Callback for usage tracking
              _on_screenshot: Callback for screenshot events
              **kwargs: Additional arguments
-
+
          Returns:
              Dictionary with "output" (output items) and "usage" array
          """
          tools = tools or []
-
+
          # Create response items
          response_items = []
-
+
          # Find computer tool for screen dimensions
          computer_tool = None
          for tool_schema in tools:
              if tool_schema["type"] == "computer":
                  computer_tool = tool_schema["computer"]
                  break
-
+
          # Get screen dimensions
          screen_width, screen_height = 1024, 768
          if computer_tool:
@@ -576,20 +629,20 @@ class UITARSConfig:
                  screen_width, screen_height = await computer_tool.get_dimensions()
              except:
                  pass
-
+
          # Process messages to extract instruction and image
          instruction = ""
          image_data = None
-
+
          # Convert messages to list if string
          if isinstance(messages, str):
              messages = [{"role": "user", "content": messages}]
-
+
          # Extract instruction and latest screenshot
          for message in reversed(messages):
              if isinstance(message, dict):
                  content = message.get("content", "")
-
+
                  # Handle different content formats
                  if isinstance(content, str):
                      if not instruction and message.get("role") == "user":
@@ -605,46 +658,41 @@ class UITARSConfig:
                                  image_data = image_url.get("url", "")
                              else:
                                  image_data = image_url
-
+
                  # Also check for computer_call_output with screenshots
                  if message.get("type") == "computer_call_output" and not image_data:
                      output = message.get("output", {})
                      if isinstance(output, dict) and output.get("type") == "input_image":
                          image_data = output.get("image_url", "")
-
+
              if instruction and image_data:
                  break
-
+
          if not instruction:
-             instruction = "Help me complete this task by analyzing the screen and taking appropriate actions."
-
+             instruction = (
+                 "Help me complete this task by analyzing the screen and taking appropriate actions."
+             )
+
          # Create prompt
          user_prompt = UITARS_PROMPT_TEMPLATE.format(
-             instruction=instruction,
-             action_space=UITARS_ACTION_SPACE,
-             language="English"
+             instruction=instruction, action_space=UITARS_ACTION_SPACE, language="English"
          )
-
+
          # Convert conversation history to LiteLLM format
          history_messages = convert_uitars_messages_to_litellm(messages)
-
+
          # Prepare messages for liteLLM
-         litellm_messages = [
-             {
-                 "role": "system",
-                 "content": "You are a helpful assistant."
-             }
-         ]
+         litellm_messages = [{"role": "system", "content": "You are a helpful assistant."}]

          # Add current user instruction with screenshot
          current_user_message = {
-             "role": "user",
+             "role": "user",
              "content": [
                  {"type": "text", "text": user_prompt},
-             ]
+             ],
          }
          litellm_messages.append(current_user_message)
-
+
          # Process image for UITARS
          if not image_data:
              # Take screenshot if none found in messages
@@ -658,17 +706,22 @@ class UITARSConfig:
                  raise ValueError("No screenshot found in messages and no computer_handler provided")
          processed_image, original_width, original_height = process_image_for_uitars(image_data)
          encoded_image = pil_to_base64(processed_image)
-
+
          # Add conversation history
          if history_messages:
              litellm_messages.extend(history_messages)
          else:
-             litellm_messages.append({
-                 "role": "user",
-                 "content": [
-                     {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_image}"}}
-                 ]
-             })
+             litellm_messages.append(
+                 {
+                     "role": "user",
+                     "content": [
+                         {
+                             "type": "image_url",
+                             "image_url": {"url": f"data:image/png;base64,{encoded_image}"},
+                         }
+                     ],
+                 }
+             )

          # Prepare API call kwargs
          api_kwargs = {
@@ -678,146 +731,143 @@ class UITARSConfig:
              "temperature": kwargs.get("temperature", 0.0),
              "do_sample": kwargs.get("temperature", 0.0) > 0.0,
              "num_retries": max_retries,
-             **{k: v for k, v in kwargs.items() if k not in ["max_tokens", "temperature"]}
+             **{k: v for k, v in kwargs.items() if k not in ["max_tokens", "temperature"]},
          }
-
+
          # Call API start hook
          if _on_api_start:
              await _on_api_start(api_kwargs)
-
+
          # Call liteLLM with UITARS model
          response = await litellm.acompletion(**api_kwargs)
-
+
          # Call API end hook
          if _on_api_end:
              await _on_api_end(api_kwargs, response)
-
+
          # Extract response content
-         response_content = response.choices[0].message.content.strip() # type: ignore
-
+         response_content = response.choices[0].message.content.strip()  # type: ignore
+
          # Parse UITARS response
          parsed_responses = parse_uitars_response(response_content, original_width, original_height)
-
+
          # Convert to computer actions
-         computer_actions = convert_to_computer_actions(parsed_responses, original_width, original_height)
-
+         computer_actions = convert_to_computer_actions(
+             parsed_responses, original_width, original_height
+         )
+
          # Add computer actions to response items
          thought = parsed_responses[0].get("thought", "")
          if thought:
              response_items.append(make_reasoning_item(thought))
          response_items.extend(computer_actions)
-
+
          # Extract usage information
          response_usage = {
-             **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
+             **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(
+                 response.usage
+             ).model_dump(),
              "response_cost": response._hidden_params.get("response_cost", 0.0),
          }
          if _on_usage:
              await _on_usage(response_usage)

          # Create agent response
-         agent_response = {
-             "output": response_items,
-             "usage": response_usage
-         }
-
+         agent_response = {"output": response_items, "usage": response_usage}
+
          return agent_response
-
+
      async def predict_click(
-         self,
-         model: str,
-         image_b64: str,
-         instruction: str
+         self, model: str, image_b64: str, instruction: str, **kwargs
      ) -> Optional[Tuple[int, int]]:
          """
          Predict click coordinates based on image and instruction.
-
+
          UITARS supports click prediction through its action parsing.
-
+
          Args:
              model: Model name to use
              image_b64: Base64 encoded image
              instruction: Instruction for where to click
-
+
          Returns:
              Tuple with (x, y) coordinates or None
          """
          try:
              # Create prompt using grounding template
-             user_prompt = GROUNDING_UITARS_PROMPT_TEMPLATE.format(
-                 instruction=instruction
-             )
-
+             user_prompt = GROUNDING_UITARS_PROMPT_TEMPLATE.format(instruction=instruction)
+
              # Process image for UITARS
              processed_image, original_width, original_height = process_image_for_uitars(image_b64)
              encoded_image = pil_to_base64(processed_image)
-
+
              # Prepare messages for liteLLM
              litellm_messages = [
-                 {
-                     "role": "system",
-                     "content": "You are a helpful assistant."
-                 },
+                 {"role": "system", "content": "You are a helpful assistant."},
                  {
                      "role": "user",
                      "content": [
                          {"type": "text", "text": user_prompt},
-                         {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_image}"}}
-                     ]
-                 }
+                         {
+                             "type": "image_url",
+                             "image_url": {"url": f"data:image/png;base64,{encoded_image}"},
+                         },
+                     ],
+                 },
              ]
-
+
              # Prepare API call kwargs
              api_kwargs = {
                  "model": model,
                  "messages": litellm_messages,
-                 "max_tokens": 100,
+                 "max_tokens": 2056,
                  "temperature": 0.0,
-                 "do_sample": False
+                 "do_sample": False,
              }
-
+             api_kwargs.update({k: v for k, v in (kwargs or {}).items()})
+
              # Call liteLLM with UITARS model
              response = await litellm.acompletion(**api_kwargs)
-
+
              # Extract response content
-             response_content = response.choices[0].message.content.strip() # type: ignore
-
+             response_content = response.choices[0].message.content.strip()  # type: ignore
+
              print(response_content)

              # Parse the response to extract click coordinates
              # Look for click action with coordinates (with special tokens)
              click_pattern = r"click\(point='<\|box_start\|>\((\d+),(\d+)\)<\|box_end\|>'\)"
              match = re.search(click_pattern, response_content)
-
+
              # Fallback: Look for simpler format without special tokens
              if not match:
                  # Pattern for: click(start_box='(x,y)') or click(point='(x,y)')
                  fallback_pattern = r"click\((?:start_box|point)='\((\d+),(\d+)\)'\)"
                  match = re.search(fallback_pattern, response_content)
-
+
              if match:
                  x, y = int(match.group(1)), int(match.group(2))
                  # Scale coordinates back to original image dimensions
                  scale_x = original_width / processed_image.width
                  scale_y = original_height / processed_image.height
-
+
                  scaled_x = int(x * scale_x)
                  scaled_y = int(y * scale_y)
-
+
                  return (scaled_x, scaled_y)
-
+
              return None
-
+
          except Exception as e:
              # Log error and return None
              print(f"Error in predict_click: {e}")
              return None
-
+
      def get_capabilities(self) -> List[AgentCapability]:
          """
          Get list of capabilities supported by this agent config.
-
+
          Returns:
              List of capability strings
          """
-         return ["step", "click"]
+         return ["step", "click"]
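
For reference, predict_click extracts the point with the regexes above and then rescales from the resized image back to the original dimensions. A minimal sketch of that parsing step; the response string and image sizes are illustrative:

    import re

    response = "click(point='<|box_start|>(412,289)<|box_end|>')"
    pattern = r"click\(point='<\|box_start\|>\((\d+),(\d+)\)<\|box_end\|>'\)"
    match = re.search(pattern, response)
    assert match is not None
    x, y = int(match.group(1)), int(match.group(2))   # 412, 289 in resized-image pixels
    scale_x, scale_y = 2560 / 1280, 1440 / 720        # original size / processed size (made up)
    print(int(x * scale_x), int(y * scale_y))         # 824 578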