cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl

This diff compares the content of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.

Potentially problematic release: this version of cua-agent has been flagged for review.

Files changed (79)
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/__init__.py +4 -0
  4. agent/adapters/azure_ml_adapter.py +283 -0
  5. agent/adapters/cua_adapter.py +161 -0
  6. agent/adapters/huggingfacelocal_adapter.py +67 -125
  7. agent/adapters/human_adapter.py +116 -114
  8. agent/adapters/mlxvlm_adapter.py +110 -99
  9. agent/adapters/models/__init__.py +41 -0
  10. agent/adapters/models/generic.py +78 -0
  11. agent/adapters/models/internvl.py +290 -0
  12. agent/adapters/models/opencua.py +115 -0
  13. agent/adapters/models/qwen2_5_vl.py +78 -0
  14. agent/agent.py +337 -185
  15. agent/callbacks/__init__.py +9 -4
  16. agent/callbacks/base.py +45 -31
  17. agent/callbacks/budget_manager.py +22 -10
  18. agent/callbacks/image_retention.py +54 -98
  19. agent/callbacks/logging.py +55 -42
  20. agent/callbacks/operator_validator.py +35 -33
  21. agent/callbacks/otel.py +291 -0
  22. agent/callbacks/pii_anonymization.py +19 -16
  23. agent/callbacks/prompt_instructions.py +47 -0
  24. agent/callbacks/telemetry.py +99 -61
  25. agent/callbacks/trajectory_saver.py +95 -69
  26. agent/cli.py +269 -119
  27. agent/computers/__init__.py +14 -9
  28. agent/computers/base.py +32 -19
  29. agent/computers/cua.py +52 -25
  30. agent/computers/custom.py +78 -71
  31. agent/decorators.py +23 -14
  32. agent/human_tool/__init__.py +2 -7
  33. agent/human_tool/__main__.py +6 -2
  34. agent/human_tool/server.py +48 -37
  35. agent/human_tool/ui.py +359 -235
  36. agent/integrations/hud/__init__.py +38 -99
  37. agent/integrations/hud/agent.py +369 -0
  38. agent/integrations/hud/proxy.py +166 -52
  39. agent/loops/__init__.py +44 -14
  40. agent/loops/anthropic.py +579 -492
  41. agent/loops/base.py +19 -15
  42. agent/loops/composed_grounded.py +136 -150
  43. agent/loops/fara/__init__.py +8 -0
  44. agent/loops/fara/config.py +506 -0
  45. agent/loops/fara/helpers.py +357 -0
  46. agent/loops/fara/schema.py +143 -0
  47. agent/loops/gelato.py +183 -0
  48. agent/loops/gemini.py +935 -0
  49. agent/loops/generic_vlm.py +601 -0
  50. agent/loops/glm45v.py +140 -135
  51. agent/loops/gta1.py +48 -51
  52. agent/loops/holo.py +218 -0
  53. agent/loops/internvl.py +180 -0
  54. agent/loops/moondream3.py +493 -0
  55. agent/loops/omniparser.py +326 -226
  56. agent/loops/openai.py +50 -51
  57. agent/loops/opencua.py +134 -0
  58. agent/loops/uiins.py +175 -0
  59. agent/loops/uitars.py +247 -206
  60. agent/loops/uitars2.py +951 -0
  61. agent/playground/__init__.py +5 -0
  62. agent/playground/server.py +301 -0
  63. agent/proxy/examples.py +61 -57
  64. agent/proxy/handlers.py +46 -39
  65. agent/responses.py +447 -347
  66. agent/tools/__init__.py +24 -0
  67. agent/tools/base.py +253 -0
  68. agent/tools/browser_tool.py +423 -0
  69. agent/types.py +11 -5
  70. agent/ui/__init__.py +1 -1
  71. agent/ui/__main__.py +1 -1
  72. agent/ui/gradio/app.py +25 -22
  73. agent/ui/gradio/ui_components.py +314 -167
  74. cua_agent-0.7.16.dist-info/METADATA +85 -0
  75. cua_agent-0.7.16.dist-info/RECORD +79 -0
  76. {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
  77. cua_agent-0.4.22.dist-info/METADATA +0 -436
  78. cua_agent-0.4.22.dist-info/RECORD +0 -51
  79. {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/loops/uitars.py CHANGED
@@ -4,39 +4,50 @@ Paper: https://arxiv.org/abs/2501.12326
 Code: https://github.com/bytedance/UI-TARS
 """

+import ast
 import asyncio
-from ctypes import cast
-import json
 import base64
+import json
 import math
 import re
-import ast
-from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
+from ctypes import cast
 from io import BytesIO
-from PIL import Image
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+
 import litellm
-from litellm.types.utils import ModelResponse
-from litellm.responses.litellm_completion_transformation.transformation import LiteLLMCompletionResponsesConfig
+from litellm.responses.litellm_completion_transformation.transformation import (
+    LiteLLMCompletionResponsesConfig,
+)
 from litellm.responses.utils import Usage
-from openai.types.responses.response_computer_tool_call_param import ActionType, ResponseComputerToolCallParam
+from litellm.types.utils import ModelResponse
+from openai.types.responses.response_computer_tool_call_param import (
+    ActionType,
+    ResponseComputerToolCallParam,
+)
 from openai.types.responses.response_input_param import ComputerCallOutput
-from openai.types.responses.response_output_message_param import ResponseOutputMessageParam
-from openai.types.responses.response_reasoning_item_param import ResponseReasoningItemParam, Summary
+from openai.types.responses.response_output_message_param import (
+    ResponseOutputMessageParam,
+)
+from openai.types.responses.response_reasoning_item_param import (
+    ResponseReasoningItemParam,
+    Summary,
+)
+from PIL import Image

 from ..decorators import register_agent
-from ..types import Messages, AgentResponse, Tools, AgentCapability
 from ..responses import (
-    make_reasoning_item,
-    make_output_text_item,
     make_click_item,
     make_double_click_item,
     make_drag_item,
+    make_input_image_item,
     make_keypress_item,
+    make_output_text_item,
+    make_reasoning_item,
     make_scroll_item,
     make_type_item,
     make_wait_item,
-    make_input_image_item
 )
+from ..types import AgentCapability, AgentResponse, Messages, Tools

 # Constants from reference code
 IMAGE_FACTOR = 28
@@ -94,6 +105,7 @@ click(point='<|box_start|>(x1,y1)<|box_end|>')
 ## User Instruction
 {instruction}"""

+
 def round_by_factor(number: float, factor: int) -> int:
     """Returns the closest integer to 'number' that is divisible by 'factor'."""
     return round(number / factor) * factor
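Aside (an illustration, not part of the diff): with IMAGE_FACTOR = 28, round_by_factor snaps a pixel dimension to the nearest multiple of 28, presumably the vision encoder's patch size, so resized screenshots stay aligned to the patch grid.

    def round_by_factor(number: float, factor: int) -> int:
        return round(number / factor) * factor

    assert round_by_factor(1000, 28) == 1008  # round(1000 / 28) = 36, and 36 * 28 = 1008
    assert round_by_factor(980, 28) == 980    # 980 is already 35 * 28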
@@ -110,7 +122,11 @@ def floor_by_factor(number: float, factor: int) -> int:


 def smart_resize(
-    height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
+    height: int,
+    width: int,
+    factor: int = IMAGE_FACTOR,
+    min_pixels: int = MIN_PIXELS,
+    max_pixels: int = MAX_PIXELS,
 ) -> tuple[int, int]:
     """
     Rescales the image so that the following conditions are met:
@@ -144,14 +160,14 @@ def escape_single_quotes(text):
 def parse_action(action_str):
     """Parse action string into structured format."""
     try:
-        node = ast.parse(action_str, mode='eval')
+        node = ast.parse(action_str, mode="eval")
         if not isinstance(node, ast.Expression):
             raise ValueError("Not an expression")
-
+
         call = node.body
         if not isinstance(call, ast.Call):
             raise ValueError("Not a function call")
-
+
         # Get function name
         if isinstance(call.func, ast.Name):
             func_name = call.func.id
@@ -159,7 +175,7 @@ def parse_action(action_str):
             func_name = call.func.attr
         else:
             func_name = None
-
+
         # Get keyword arguments
         kwargs = {}
         for kw in call.keywords:
@@ -171,12 +187,9 @@ def parse_action(action_str):
             else:
                 value = None
             kwargs[key] = value
-
-        return {
-            'function': func_name,
-            'args': kwargs
-        }
-
+
+        return {"function": func_name, "args": kwargs}
+
     except Exception as e:
         print(f"Failed to parse action '{action_str}': {e}")
         return None
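To make the parsing concrete (an illustrative sketch with a made-up action string; this restates the helper above in minimal form, without its error handling): parse_action evaluates the model's pseudo-call with ast and returns the function name plus its keyword arguments.

    import ast

    def parse_action(action_str):
        call = ast.parse(action_str, mode="eval").body
        func_name = call.func.id if isinstance(call.func, ast.Name) else call.func.attr
        kwargs = {kw.arg: kw.value.value for kw in call.keywords}
        return {"function": func_name, "args": kwargs}

    print(parse_action("click(start_box='(100,200)')"))
    # -> {'function': 'click', 'args': {'start_box': '(100,200)'}}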
@@ -185,39 +198,39 @@ def parse_action(action_str):
 def parse_uitars_response(text: str, image_width: int, image_height: int) -> List[Dict[str, Any]]:
     """Parse UITARS model response into structured actions."""
     text = text.strip()
-
+
     # Extract thought
     thought = None
     if text.startswith("Thought:"):
         thought_match = re.search(r"Thought: (.+?)(?=\s*Action:|$)", text, re.DOTALL)
         if thought_match:
             thought = thought_match.group(1).strip()
-
+
     # Extract action
     if "Action:" not in text:
         raise ValueError("No Action found in response")
-
+
     action_str = text.split("Action:")[-1].strip()

     # Handle special case for type actions
     if "type(content" in action_str:
+
         def escape_quotes(match):
             return match.group(1)
-
+
         pattern = r"type\(content='(.*?)'\)"
         content = re.sub(pattern, escape_quotes, action_str)
         action_str = escape_single_quotes(content)
         action_str = "type(content='" + action_str + "')"
-
-
+
     # Parse the action
     parsed_action = parse_action(action_str.replace("\n", "\\n").lstrip())
     if parsed_action is None:
         raise ValueError(f"Action can't parse: {action_str}")
-
+
     action_type = parsed_action["function"]
     params = parsed_action["args"]
-
+
     # Process parameters
     action_inputs = {}
     for param_name, param in params.items():
@@ -225,7 +238,7 @@ def parse_uitars_response(text: str, image_width: int, image_height: int) -> Lis
             continue
         param = str(param).lstrip()
         action_inputs[param_name.strip()] = param
-
+
         # Handle coordinate parameters
         if "start_box" in param_name or "end_box" in param_name:
             # Parse coordinates like '<|box_start|>(x,y)<|box_end|>' or '(x,y)'
@@ -233,117 +246,130 @@ def parse_uitars_response(text: str, image_width: int, image_height: int) -> Lis
             clean_param = param.replace("<|box_start|>", "").replace("<|box_end|>", "")
             # Then remove parentheses and split
             numbers = clean_param.replace("(", "").replace(")", "").split(",")
-
+
             try:
-                float_numbers = [float(num.strip()) / 1000 for num in numbers]  # Normalize to 0-1 range
-
+                float_numbers = [
+                    float(num.strip()) / 1000 for num in numbers
+                ]  # Normalize to 0-1 range
+
                 if len(float_numbers) == 2:
                     # Single point, duplicate for box format
-                    float_numbers = [float_numbers[0], float_numbers[1], float_numbers[0], float_numbers[1]]
-
+                    float_numbers = [
+                        float_numbers[0],
+                        float_numbers[1],
+                        float_numbers[0],
+                        float_numbers[1],
+                    ]
+
                 action_inputs[param_name.strip()] = str(float_numbers)
             except ValueError as e:
                 # If parsing fails, keep the original parameter value
                 print(f"Warning: Could not parse coordinates '{param}': {e}")
                 action_inputs[param_name.strip()] = param
-
-    return [{
-        "thought": thought,
-        "action_type": action_type,
-        "action_inputs": action_inputs,
-        "text": text
-    }]

+    return [
+        {
+            "thought": thought,
+            "action_type": action_type,
+            "action_inputs": action_inputs,
+            "text": text,
+        }
+    ]

-def convert_to_computer_actions(parsed_responses: List[Dict[str, Any]], image_width: int, image_height: int) -> List[ResponseComputerToolCallParam | ResponseOutputMessageParam]:
+
+def convert_to_computer_actions(
+    parsed_responses: List[Dict[str, Any]], image_width: int, image_height: int
+) -> List[ResponseComputerToolCallParam | ResponseOutputMessageParam]:
     """Convert parsed UITARS responses to computer actions."""
     computer_actions = []
-
+
     for response in parsed_responses:
         action_type = response.get("action_type")
         action_inputs = response.get("action_inputs", {})
-
+
         if action_type == "finished":
             finished_text = action_inputs.get("content", "Task completed successfully.")
             computer_actions.append(make_output_text_item(finished_text))
             break
-
+
         elif action_type == "wait":
            computer_actions.append(make_wait_item())
-
+
         elif action_type == "call_user":
-            computer_actions.append(make_output_text_item("I need assistance from the user to proceed with this task."))
-
+            computer_actions.append(
+                make_output_text_item("I need assistance from the user to proceed with this task.")
+            )
+
         elif action_type in ["click", "left_single"]:
             start_box = action_inputs.get("start_box")
             if start_box:
                 coords = eval(start_box)
                 x = int((coords[0] + coords[2]) / 2 * image_width)
                 y = int((coords[1] + coords[3]) / 2 * image_height)
-
+
                 computer_actions.append(make_click_item(x, y, "left"))
-
-        elif action_type == "double_click":
+
+        elif action_type in ["double_click", "left_double"]:
             start_box = action_inputs.get("start_box")
             if start_box:
                 coords = eval(start_box)
                 x = int((coords[0] + coords[2]) / 2 * image_width)
                 y = int((coords[1] + coords[3]) / 2 * image_height)
-
+
                 computer_actions.append(make_double_click_item(x, y))
-
-        elif action_type == "right_click":
+
+        elif action_type in ["right_click", "right_single"]:
             start_box = action_inputs.get("start_box")
             if start_box:
                 coords = eval(start_box)
                 x = int((coords[0] + coords[2]) / 2 * image_width)
                 y = int((coords[1] + coords[3]) / 2 * image_height)
-
+
                 computer_actions.append(make_click_item(x, y, "right"))
-
+
         elif action_type == "type":
             content = action_inputs.get("content", "")
             computer_actions.append(make_type_item(content))
-
+
         elif action_type == "hotkey":
             key = action_inputs.get("key", "")
             keys = key.split()
             computer_actions.append(make_keypress_item(keys))
-
+
         elif action_type == "press":
             key = action_inputs.get("key", "")
             computer_actions.append(make_keypress_item([key]))
-
+
         elif action_type == "scroll":
             start_box = action_inputs.get("start_box")
             direction = action_inputs.get("direction", "down")
-
+
             if start_box:
                 coords = eval(start_box)
                 x = int((coords[0] + coords[2]) / 2 * image_width)
                 y = int((coords[1] + coords[3]) / 2 * image_height)
             else:
                 x, y = image_width // 2, image_height // 2
-
+
             scroll_y = 5 if "up" in direction.lower() else -5
             computer_actions.append(make_scroll_item(x, y, 0, scroll_y))
-
+
         elif action_type == "drag":
             start_box = action_inputs.get("start_box")
             end_box = action_inputs.get("end_box")
-
+
             if start_box and end_box:
                 start_coords = eval(start_box)
                 end_coords = eval(end_box)
-
+
                 start_x = int((start_coords[0] + start_coords[2]) / 2 * image_width)
                 start_y = int((start_coords[1] + start_coords[3]) / 2 * image_height)
                 end_x = int((end_coords[0] + end_coords[2]) / 2 * image_width)
                 end_y = int((end_coords[1] + end_coords[3]) / 2 * image_height)
-
+
                 path = [{"x": start_x, "y": start_y}, {"x": end_x, "y": end_y}]
                 computer_actions.append(make_drag_item(path))
-
+
     return computer_actions
  return computer_actions
348
374
 
349
375
 
@@ -354,33 +380,35 @@ def pil_to_base64(image: Image.Image) -> str:
354
380
  return base64.b64encode(buffer.getvalue()).decode("utf-8")
355
381
 
356
382
 
357
- def process_image_for_uitars(image_data: str, max_pixels: int = MAX_PIXELS, min_pixels: int = MIN_PIXELS) -> tuple[Image.Image, int, int]:
383
+ def process_image_for_uitars(
384
+ image_data: str, max_pixels: int = MAX_PIXELS, min_pixels: int = MIN_PIXELS
385
+ ) -> tuple[Image.Image, int, int]:
358
386
  """Process image for UITARS model input."""
359
387
  # Decode base64 image
360
- if image_data.startswith('data:image'):
361
- image_data = image_data.split(',')[1]
362
-
388
+ if image_data.startswith("data:image"):
389
+ image_data = image_data.split(",")[1]
390
+
363
391
  image_bytes = base64.b64decode(image_data)
364
392
  image = Image.open(BytesIO(image_bytes))
365
-
393
+
366
394
  original_width, original_height = image.size
367
-
395
+
368
396
  # Resize image according to UITARS requirements
369
397
  if image.width * image.height > max_pixels:
370
398
  resize_factor = math.sqrt(max_pixels / (image.width * image.height))
371
399
  width = int(image.width * resize_factor)
372
400
  height = int(image.height * resize_factor)
373
401
  image = image.resize((width, height))
374
-
402
+
375
403
  if image.width * image.height < min_pixels:
376
404
  resize_factor = math.sqrt(min_pixels / (image.width * image.height))
377
405
  width = math.ceil(image.width * resize_factor)
378
406
  height = math.ceil(image.height * resize_factor)
379
407
  image = image.resize((width, height))
380
-
408
+
381
409
  if image.mode != "RGB":
382
410
  image = image.convert("RGB")
383
-
411
+
384
412
  return image, original_width, original_height
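Why the square root in the resize branches (a sketch with a made-up budget; the real MIN_PIXELS/MAX_PIXELS constants are defined near the top of the file and not shown in this hunk): scaling both sides by sqrt(max_pixels / area) preserves the aspect ratio while bringing the pixel count just under the budget.

    import math

    max_pixels = 1_000_000                      # assumed value, for illustration only
    w, h = 2560, 1440                           # 3,686,400 px, over budget
    factor = math.sqrt(max_pixels / (w * h))    # ~0.52
    w, h = int(w * factor), int(h * factor)     # roughly 1333 x 750
    assert w * h <= max_pixels                  # area now within budget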
@@ -391,7 +419,11 @@ def sanitize_message(msg: Any) -> Any:
     for key, value in msg.items():
         if key == "content" and isinstance(value, list):
             result[key] = [
-                {k: v for k, v in item.items() if k != "image_url"} if isinstance(item, dict) else item
+                (
+                    {k: v for k, v in item.items() if k != "image_url"}
+                    if isinstance(item, dict)
+                    else item
+                )
                 for item in value
             ]
         else:
@@ -406,38 +438,41 @@ def sanitize_message(msg: Any) -> Any:
 def convert_uitars_messages_to_litellm(messages: Messages) -> List[Dict[str, Any]]:
     """
     Convert UITARS internal message format back to LiteLLM format.
-
+
     This function processes reasoning, computer_call, and computer_call_output messages
     and converts them to the appropriate LiteLLM assistant message format.
-
+
     Args:
         messages: List of UITARS internal messages
-
+
     Returns:
         List of LiteLLM formatted messages
     """
     litellm_messages = []
     current_assistant_content = []
-
+
     for message in messages:
         if isinstance(message, dict):
             message_type = message.get("type")
-
+
             if message_type == "reasoning":
                 # Extract reasoning text from summary
                 summary = message.get("summary", [])
                 if summary and isinstance(summary, list):
                     for summary_item in summary:
-                        if isinstance(summary_item, dict) and summary_item.get("type") == "summary_text":
+                        if (
+                            isinstance(summary_item, dict)
+                            and summary_item.get("type") == "summary_text"
+                        ):
                             reasoning_text = summary_item.get("text", "")
                             if reasoning_text:
                                 current_assistant_content.append(f"Thought: {reasoning_text}")
-
+
             elif message_type == "computer_call":
                 # Convert computer action to UITARS action format
                 action = message.get("action", {})
                 action_type = action.get("type")
-
+
                 if action_type == "click":
                     x, y = action.get("x", 0), action.get("y", 0)
                     button = action.get("button", "left")
@@ -447,59 +482,65 @@ def convert_uitars_messages_to_litellm(messages: Messages) -> List[Dict[str, Any
                         action_text = f"Action: right_single(start_box='({x},{y})')"
                     else:
                         action_text = f"Action: click(start_box='({x},{y})')"
-
+
                 elif action_type == "double_click":
                     x, y = action.get("x", 0), action.get("y", 0)
                     action_text = f"Action: left_double(start_box='({x},{y})')"
-
+
                 elif action_type == "drag":
                     start_x, start_y = action.get("start_x", 0), action.get("start_y", 0)
                     end_x, end_y = action.get("end_x", 0), action.get("end_y", 0)
                     action_text = f"Action: drag(start_box='({start_x},{start_y})', end_box='({end_x},{end_y})')"
-
+
                 elif action_type == "key":
                     key = action.get("key", "")
                     action_text = f"Action: hotkey(key='{key}')"
-
+
                 elif action_type == "type":
                     text = action.get("text", "")
                     # Escape single quotes in the text
                     escaped_text = escape_single_quotes(text)
                     action_text = f"Action: type(content='{escaped_text}')"
-
+
                 elif action_type == "scroll":
                     x, y = action.get("x", 0), action.get("y", 0)
                     direction = action.get("direction", "down")
                     action_text = f"Action: scroll(start_box='({x},{y})', direction='{direction}')"
-
+
                 elif action_type == "wait":
                     action_text = "Action: wait()"
-
+
                 else:
                     # Fallback for unknown action types
                     action_text = f"Action: {action_type}({action})"
-
+
                 current_assistant_content.append(action_text)
-
+
                 # When we hit a computer_call_output, finalize the current assistant message
                 if current_assistant_content:
-                    litellm_messages.append({
-                        "role": "assistant",
-                        "content": [{"type": "text", "text": "\n".join(current_assistant_content)}]
-                    })
+                    litellm_messages.append(
+                        {
+                            "role": "assistant",
+                            "content": [
+                                {"type": "text", "text": "\n".join(current_assistant_content)}
+                            ],
+                        }
+                    )
                     current_assistant_content = []
-
+
             elif message_type == "computer_call_output":
                 # Add screenshot from computer call output
                 output = message.get("output", {})
                 if isinstance(output, dict) and output.get("type") == "input_image":
                     image_url = output.get("image_url", "")
                     if image_url:
-                        litellm_messages.append({
-                            "role": "user",
-                            "content": [{"type": "image_url", "image_url": {"url": image_url}}]
-                        })
-
+                        litellm_messages.append(
+                            {
+                                "role": "user",
+                                "content": [{"type": "image_url", "image_url": {"url": image_url}}],
+                            }
+                        )
+
             elif message.get("role") == "user":
                 # # Handle user messages
                 # content = message.get("content", "")
@@ -514,24 +555,27 @@ def convert_uitars_messages_to_litellm(messages: Messages) -> List[Dict[str, Any
                 #     "content": content
                 # })
                 pass
-
+
     # Add any remaining assistant content
     if current_assistant_content:
-        litellm_messages.append({
-            "role": "assistant",
-            "content": current_assistant_content
-        })
-
+        litellm_messages.append(
+            {
+                "role": "assistant",
+                "content": [{"type": "text", "text": "\n".join(current_assistant_content)}],
+            }
+        )
+
     return litellm_messages

-@register_agent(models=r"(?i).*ui-?tars.*")
+
+@register_agent(models=r"(?i).*ui-?tars.*", priority=-1)
 class UITARSConfig:
     """
     UITARS agent configuration using liteLLM for ByteDance-Seed/UI-TARS-1.5-7B model.
-
+
     Supports UITARS vision-language models for computer control.
     """
-
+
     async def predict_step(
         self,
         messages: List[Dict[str, Any]],
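On the registration change in the hunk above: the model pattern is unchanged, but priority=-1 presumably demotes this config so other loops matching the same model ids can take precedence (the exact semantics live in agent/decorators.py, not shown here). The regex matches any model id containing "uitars" or "ui-tars", case-insensitively; a quick check with invented model names:

    import re

    pattern = re.compile(r"(?i).*ui-?tars.*")
    assert pattern.match("ByteDance-Seed/UI-TARS-1.5-7B")
    assert pattern.match("my-org/uitars-finetune")  # hypothetical id
    assert not pattern.match("gpt-4o")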
@@ -545,11 +589,11 @@ class UITARSConfig:
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         """
         Predict the next step based on input messages.
-
+
         Args:
             messages: Input messages following Responses format
             model: Model name to use
@@ -562,22 +606,22 @@ class UITARSConfig:
             _on_usage: Callback for usage tracking
             _on_screenshot: Callback for screenshot events
             **kwargs: Additional arguments
-
+
         Returns:
             Dictionary with "output" (output items) and "usage" array
         """
         tools = tools or []
-
+
         # Create response items
         response_items = []
-
+
         # Find computer tool for screen dimensions
         computer_tool = None
         for tool_schema in tools:
             if tool_schema["type"] == "computer":
                 computer_tool = tool_schema["computer"]
                 break
-
+
         # Get screen dimensions
         screen_width, screen_height = 1024, 768
         if computer_tool:
@@ -585,20 +629,20 @@ class UITARSConfig:
                 screen_width, screen_height = await computer_tool.get_dimensions()
             except:
                 pass
-
+
         # Process messages to extract instruction and image
         instruction = ""
         image_data = None
-
+
         # Convert messages to list if string
         if isinstance(messages, str):
             messages = [{"role": "user", "content": messages}]
-
+
         # Extract instruction and latest screenshot
         for message in reversed(messages):
             if isinstance(message, dict):
                 content = message.get("content", "")
-
+
                 # Handle different content formats
                 if isinstance(content, str):
                     if not instruction and message.get("role") == "user":
@@ -614,46 +658,41 @@ class UITARSConfig:
                             image_data = image_url.get("url", "")
                         else:
                             image_data = image_url
-
+
                 # Also check for computer_call_output with screenshots
                 if message.get("type") == "computer_call_output" and not image_data:
                     output = message.get("output", {})
                     if isinstance(output, dict) and output.get("type") == "input_image":
                         image_data = output.get("image_url", "")
-
+
                 if instruction and image_data:
                     break
-
+
         if not instruction:
-            instruction = "Help me complete this task by analyzing the screen and taking appropriate actions."
-
+            instruction = (
+                "Help me complete this task by analyzing the screen and taking appropriate actions."
+            )
+
         # Create prompt
         user_prompt = UITARS_PROMPT_TEMPLATE.format(
-            instruction=instruction,
-            action_space=UITARS_ACTION_SPACE,
-            language="English"
+            instruction=instruction, action_space=UITARS_ACTION_SPACE, language="English"
         )
-
+
         # Convert conversation history to LiteLLM format
         history_messages = convert_uitars_messages_to_litellm(messages)
-
+
         # Prepare messages for liteLLM
-        litellm_messages = [
-            {
-                "role": "system",
-                "content": "You are a helpful assistant."
-            }
-        ]
+        litellm_messages = [{"role": "system", "content": "You are a helpful assistant."}]

         # Add current user instruction with screenshot
         current_user_message = {
-            "role": "user",
+            "role": "user",
             "content": [
                 {"type": "text", "text": user_prompt},
-            ]
+            ],
         }
         litellm_messages.append(current_user_message)
-
+
         # Process image for UITARS
         if not image_data:
             # Take screenshot if none found in messages
@@ -667,17 +706,22 @@ class UITARSConfig:
                 raise ValueError("No screenshot found in messages and no computer_handler provided")
         processed_image, original_width, original_height = process_image_for_uitars(image_data)
         encoded_image = pil_to_base64(processed_image)
-
+
         # Add conversation history
         if history_messages:
             litellm_messages.extend(history_messages)
         else:
-            litellm_messages.append({
-                "role": "user",
-                "content": [
-                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_image}"}}
-                ]
-            })
+            litellm_messages.append(
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/png;base64,{encoded_image}"},
+                        }
+                    ],
+                }
+            )

         # Prepare API call kwargs
         api_kwargs = {
@@ -687,146 +731,143 @@ class UITARSConfig:
             "temperature": kwargs.get("temperature", 0.0),
             "do_sample": kwargs.get("temperature", 0.0) > 0.0,
             "num_retries": max_retries,
-            **{k: v for k, v in kwargs.items() if k not in ["max_tokens", "temperature"]}
+            **{k: v for k, v in kwargs.items() if k not in ["max_tokens", "temperature"]},
         }
-
+
         # Call API start hook
         if _on_api_start:
             await _on_api_start(api_kwargs)
-
+
         # Call liteLLM with UITARS model
         response = await litellm.acompletion(**api_kwargs)
-
+
         # Call API end hook
         if _on_api_end:
             await _on_api_end(api_kwargs, response)
-
+
         # Extract response content
-        response_content = response.choices[0].message.content.strip() # type: ignore
-
+        response_content = response.choices[0].message.content.strip()  # type: ignore
+
         # Parse UITARS response
         parsed_responses = parse_uitars_response(response_content, original_width, original_height)
-
+
         # Convert to computer actions
-        computer_actions = convert_to_computer_actions(parsed_responses, original_width, original_height)
-
+        computer_actions = convert_to_computer_actions(
+            parsed_responses, original_width, original_height
+        )
+
         # Add computer actions to response items
         thought = parsed_responses[0].get("thought", "")
         if thought:
             response_items.append(make_reasoning_item(thought))
         response_items.extend(computer_actions)
-
+
         # Extract usage information
         response_usage = {
-            **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
+            **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(
+                response.usage
+            ).model_dump(),
             "response_cost": response._hidden_params.get("response_cost", 0.0),
         }
         if _on_usage:
             await _on_usage(response_usage)

         # Create agent response
-        agent_response = {
-            "output": response_items,
-            "usage": response_usage
-        }
-
+        agent_response = {"output": response_items, "usage": response_usage}
+
         return agent_response
-
+
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str
+        self, model: str, image_b64: str, instruction: str, **kwargs
     ) -> Optional[Tuple[int, int]]:
         """
         Predict click coordinates based on image and instruction.
-
+
         UITARS supports click prediction through its action parsing.
-
+
         Args:
             model: Model name to use
             image_b64: Base64 encoded image
             instruction: Instruction for where to click
-
+
         Returns:
             Tuple with (x, y) coordinates or None
         """
         try:
             # Create prompt using grounding template
-            user_prompt = GROUNDING_UITARS_PROMPT_TEMPLATE.format(
-                instruction=instruction
-            )
-
+            user_prompt = GROUNDING_UITARS_PROMPT_TEMPLATE.format(instruction=instruction)
+
             # Process image for UITARS
             processed_image, original_width, original_height = process_image_for_uitars(image_b64)
             encoded_image = pil_to_base64(processed_image)
-
+
             # Prepare messages for liteLLM
             litellm_messages = [
-                {
-                    "role": "system",
-                    "content": "You are a helpful assistant."
-                },
+                {"role": "system", "content": "You are a helpful assistant."},
                 {
                     "role": "user",
                     "content": [
                         {"type": "text", "text": user_prompt},
-                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_image}"}}
-                    ]
-                }
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/png;base64,{encoded_image}"},
+                        },
+                    ],
+                },
             ]
-
+
             # Prepare API call kwargs
             api_kwargs = {
                 "model": model,
                 "messages": litellm_messages,
-                "max_tokens": 100,
+                "max_tokens": 2056,
                 "temperature": 0.0,
-                "do_sample": False
+                "do_sample": False,
             }
-
+            api_kwargs.update({k: v for k, v in (kwargs or {}).items()})
+
             # Call liteLLM with UITARS model
             response = await litellm.acompletion(**api_kwargs)
-
+
             # Extract response content
-            response_content = response.choices[0].message.content.strip() # type: ignore
-
+            response_content = response.choices[0].message.content.strip()  # type: ignore
+
             print(response_content)

             # Parse the response to extract click coordinates
             # Look for click action with coordinates (with special tokens)
             click_pattern = r"click\(point='<\|box_start\|>\((\d+),(\d+)\)<\|box_end\|>'\)"
             match = re.search(click_pattern, response_content)
-
+
             # Fallback: Look for simpler format without special tokens
             if not match:
                 # Pattern for: click(start_box='(x,y)') or click(point='(x,y)')
                 fallback_pattern = r"click\((?:start_box|point)='\((\d+),(\d+)\)'\)"
                 match = re.search(fallback_pattern, response_content)
-
+
             if match:
                 x, y = int(match.group(1)), int(match.group(2))
                 # Scale coordinates back to original image dimensions
                 scale_x = original_width / processed_image.width
                 scale_y = original_height / processed_image.height
-
+
                 scaled_x = int(x * scale_x)
                 scaled_y = int(y * scale_y)
-
+
                 return (scaled_x, scaled_y)
-
+
             return None
-
+
         except Exception as e:
             # Log error and return None
             print(f"Error in predict_click: {e}")
             return None
-
+
     def get_capabilities(self) -> List[AgentCapability]:
         """
         Get list of capabilities supported by this agent config.
-
+
         Returns:
             List of capability strings
         """
-        return ["step", "click"]
+        return ["step", "click"]
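To close the loop on predict_click (a sketch with invented numbers): the two regexes pull (x, y) out of the model's click action, and the scale factors undo the smart resize so the point lands on the original screenshot.

    import re

    response_content = "click(point='<|box_start|>(512,384)<|box_end|>')"
    m = re.search(r"click\(point='<\|box_start\|>\((\d+),(\d+)\)<\|box_end\|>'\)", response_content)
    assert m is not None
    x, y = int(m.group(1)), int(m.group(2))
    # If a 1920x1080 screenshot was processed down to 1280x720:
    scale_x, scale_y = 1920 / 1280, 1080 / 720   # both 1.5
    print(int(x * scale_x), int(y * scale_y))    # -> 768 576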