cua-agent 0.4.34__py3-none-any.whl → 0.4.36__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cua-agent might be problematic.

Files changed (61)
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/huggingfacelocal_adapter.py +54 -61
  4. agent/adapters/human_adapter.py +116 -114
  5. agent/adapters/mlxvlm_adapter.py +110 -99
  6. agent/adapters/models/__init__.py +14 -6
  7. agent/adapters/models/generic.py +7 -4
  8. agent/adapters/models/internvl.py +66 -30
  9. agent/adapters/models/opencua.py +23 -8
  10. agent/adapters/models/qwen2_5_vl.py +7 -4
  11. agent/agent.py +184 -158
  12. agent/callbacks/__init__.py +4 -4
  13. agent/callbacks/base.py +45 -31
  14. agent/callbacks/budget_manager.py +22 -10
  15. agent/callbacks/image_retention.py +18 -13
  16. agent/callbacks/logging.py +55 -42
  17. agent/callbacks/operator_validator.py +3 -1
  18. agent/callbacks/pii_anonymization.py +19 -16
  19. agent/callbacks/telemetry.py +67 -61
  20. agent/callbacks/trajectory_saver.py +90 -70
  21. agent/cli.py +115 -110
  22. agent/computers/__init__.py +13 -8
  23. agent/computers/base.py +32 -19
  24. agent/computers/cua.py +33 -25
  25. agent/computers/custom.py +78 -71
  26. agent/decorators.py +23 -14
  27. agent/human_tool/__init__.py +2 -7
  28. agent/human_tool/__main__.py +6 -2
  29. agent/human_tool/server.py +48 -37
  30. agent/human_tool/ui.py +235 -185
  31. agent/integrations/hud/__init__.py +15 -21
  32. agent/integrations/hud/agent.py +101 -83
  33. agent/integrations/hud/proxy.py +90 -57
  34. agent/loops/__init__.py +25 -21
  35. agent/loops/anthropic.py +537 -483
  36. agent/loops/base.py +13 -14
  37. agent/loops/composed_grounded.py +135 -149
  38. agent/loops/gemini.py +31 -12
  39. agent/loops/glm45v.py +135 -133
  40. agent/loops/gta1.py +47 -50
  41. agent/loops/holo.py +4 -2
  42. agent/loops/internvl.py +6 -11
  43. agent/loops/moondream3.py +36 -12
  44. agent/loops/omniparser.py +215 -210
  45. agent/loops/openai.py +49 -50
  46. agent/loops/opencua.py +29 -41
  47. agent/loops/qwen.py +510 -0
  48. agent/loops/uitars.py +237 -202
  49. agent/proxy/examples.py +54 -50
  50. agent/proxy/handlers.py +27 -34
  51. agent/responses.py +330 -330
  52. agent/types.py +11 -5
  53. agent/ui/__init__.py +1 -1
  54. agent/ui/__main__.py +1 -1
  55. agent/ui/gradio/app.py +23 -18
  56. agent/ui/gradio/ui_components.py +310 -161
  57. {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/METADATA +18 -10
  58. cua_agent-0.4.36.dist-info/RECORD +64 -0
  59. cua_agent-0.4.34.dist-info/RECORD +0 -63
  60. {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/WHEEL +0 -0
  61. {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/entry_points.txt +0 -0
agent/loops/uitars.py CHANGED
@@ -4,39 +4,50 @@ Paper: https://arxiv.org/abs/2501.12326
 Code: https://github.com/bytedance/UI-TARS
 """
 
+import ast
 import asyncio
-from ctypes import cast
-import json
 import base64
+import json
 import math
 import re
-import ast
-from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
+from ctypes import cast
 from io import BytesIO
-from PIL import Image
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+
 import litellm
-from litellm.types.utils import ModelResponse
-from litellm.responses.litellm_completion_transformation.transformation import LiteLLMCompletionResponsesConfig
+from litellm.responses.litellm_completion_transformation.transformation import (
+    LiteLLMCompletionResponsesConfig,
+)
 from litellm.responses.utils import Usage
-from openai.types.responses.response_computer_tool_call_param import ActionType, ResponseComputerToolCallParam
+from litellm.types.utils import ModelResponse
+from openai.types.responses.response_computer_tool_call_param import (
+    ActionType,
+    ResponseComputerToolCallParam,
+)
 from openai.types.responses.response_input_param import ComputerCallOutput
-from openai.types.responses.response_output_message_param import ResponseOutputMessageParam
-from openai.types.responses.response_reasoning_item_param import ResponseReasoningItemParam, Summary
+from openai.types.responses.response_output_message_param import (
+    ResponseOutputMessageParam,
+)
+from openai.types.responses.response_reasoning_item_param import (
+    ResponseReasoningItemParam,
+    Summary,
+)
+from PIL import Image
 
 from ..decorators import register_agent
-from ..types import Messages, AgentResponse, Tools, AgentCapability
 from ..responses import (
-    make_reasoning_item,
-    make_output_text_item,
     make_click_item,
     make_double_click_item,
     make_drag_item,
+    make_input_image_item,
     make_keypress_item,
+    make_output_text_item,
+    make_reasoning_item,
     make_scroll_item,
     make_type_item,
     make_wait_item,
-    make_input_image_item
 )
+from ..types import AgentCapability, AgentResponse, Messages, Tools
 
 # Constants from reference code
 IMAGE_FACTOR = 28
@@ -94,6 +105,7 @@ click(point='<|box_start|>(x1,y1)<|box_end|>')
 ## User Instruction
 {instruction}"""
 
+
 def round_by_factor(number: float, factor: int) -> int:
     """Returns the closest integer to 'number' that is divisible by 'factor'."""
     return round(number / factor) * factor
@@ -110,7 +122,11 @@ def floor_by_factor(number: float, factor: int) -> int:
 
 
 def smart_resize(
-    height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
+    height: int,
+    width: int,
+    factor: int = IMAGE_FACTOR,
+    min_pixels: int = MIN_PIXELS,
+    max_pixels: int = MAX_PIXELS,
 ) -> tuple[int, int]:
     """
     Rescales the image so that the following conditions are met:
@@ -144,14 +160,14 @@ def escape_single_quotes(text):
 def parse_action(action_str):
     """Parse action string into structured format."""
     try:
-        node = ast.parse(action_str, mode='eval')
+        node = ast.parse(action_str, mode="eval")
         if not isinstance(node, ast.Expression):
             raise ValueError("Not an expression")
-
+
         call = node.body
         if not isinstance(call, ast.Call):
             raise ValueError("Not a function call")
-
+
         # Get function name
         if isinstance(call.func, ast.Name):
             func_name = call.func.id
@@ -159,7 +175,7 @@ def parse_action(action_str):
             func_name = call.func.attr
         else:
             func_name = None
-
+
         # Get keyword arguments
         kwargs = {}
         for kw in call.keywords:
@@ -171,12 +187,9 @@ def parse_action(action_str):
             else:
                 value = None
             kwargs[key] = value
-
-        return {
-            'function': func_name,
-            'args': kwargs
-        }
-
+
+        return {"function": func_name, "args": kwargs}
+
     except Exception as e:
         print(f"Failed to parse action '{action_str}': {e}")
         return None
@@ -185,39 +198,39 @@ def parse_action(action_str):
185
198
  def parse_uitars_response(text: str, image_width: int, image_height: int) -> List[Dict[str, Any]]:
186
199
  """Parse UITARS model response into structured actions."""
187
200
  text = text.strip()
188
-
201
+
189
202
  # Extract thought
190
203
  thought = None
191
204
  if text.startswith("Thought:"):
192
205
  thought_match = re.search(r"Thought: (.+?)(?=\s*Action:|$)", text, re.DOTALL)
193
206
  if thought_match:
194
207
  thought = thought_match.group(1).strip()
195
-
208
+
196
209
  # Extract action
197
210
  if "Action:" not in text:
198
211
  raise ValueError("No Action found in response")
199
-
212
+
200
213
  action_str = text.split("Action:")[-1].strip()
201
214
 
202
215
  # Handle special case for type actions
203
216
  if "type(content" in action_str:
217
+
204
218
  def escape_quotes(match):
205
219
  return match.group(1)
206
-
220
+
207
221
  pattern = r"type\(content='(.*?)'\)"
208
222
  content = re.sub(pattern, escape_quotes, action_str)
209
223
  action_str = escape_single_quotes(content)
210
224
  action_str = "type(content='" + action_str + "')"
211
-
212
-
225
+
213
226
  # Parse the action
214
227
  parsed_action = parse_action(action_str.replace("\n", "\\n").lstrip())
215
228
  if parsed_action is None:
216
229
  raise ValueError(f"Action can't parse: {action_str}")
217
-
230
+
218
231
  action_type = parsed_action["function"]
219
232
  params = parsed_action["args"]
220
-
233
+
221
234
  # Process parameters
222
235
  action_inputs = {}
223
236
  for param_name, param in params.items():
@@ -225,7 +238,7 @@ def parse_uitars_response(text: str, image_width: int, image_height: int) -> List[Dict[str, Any]]:
             continue
         param = str(param).lstrip()
         action_inputs[param_name.strip()] = param
-
+
         # Handle coordinate parameters
         if "start_box" in param_name or "end_box" in param_name:
             # Parse coordinates like '<|box_start|>(x,y)<|box_end|>' or '(x,y)'
@@ -233,117 +246,130 @@ def parse_uitars_response(text: str, image_width: int, image_height: int) -> List[Dict[str, Any]]:
             clean_param = param.replace("<|box_start|>", "").replace("<|box_end|>", "")
             # Then remove parentheses and split
             numbers = clean_param.replace("(", "").replace(")", "").split(",")
-
+
             try:
-                float_numbers = [float(num.strip()) / 1000 for num in numbers] # Normalize to 0-1 range
-
+                float_numbers = [
+                    float(num.strip()) / 1000 for num in numbers
+                ]  # Normalize to 0-1 range
+
                 if len(float_numbers) == 2:
                     # Single point, duplicate for box format
-                    float_numbers = [float_numbers[0], float_numbers[1], float_numbers[0], float_numbers[1]]
-
+                    float_numbers = [
+                        float_numbers[0],
+                        float_numbers[1],
+                        float_numbers[0],
+                        float_numbers[1],
+                    ]
+
                 action_inputs[param_name.strip()] = str(float_numbers)
             except ValueError as e:
                 # If parsing fails, keep the original parameter value
                 print(f"Warning: Could not parse coordinates '{param}': {e}")
                 action_inputs[param_name.strip()] = param
-
-    return [{
-        "thought": thought,
-        "action_type": action_type,
-        "action_inputs": action_inputs,
-        "text": text
-    }]
+
+    return [
+        {
+            "thought": thought,
+            "action_type": action_type,
+            "action_inputs": action_inputs,
+            "text": text,
+        }
+    ]
 
 
-def convert_to_computer_actions(parsed_responses: List[Dict[str, Any]], image_width: int, image_height: int) -> List[ResponseComputerToolCallParam | ResponseOutputMessageParam]:
+def convert_to_computer_actions(
+    parsed_responses: List[Dict[str, Any]], image_width: int, image_height: int
+) -> List[ResponseComputerToolCallParam | ResponseOutputMessageParam]:
     """Convert parsed UITARS responses to computer actions."""
     computer_actions = []
-
+
     for response in parsed_responses:
         action_type = response.get("action_type")
         action_inputs = response.get("action_inputs", {})
-
+
         if action_type == "finished":
             finished_text = action_inputs.get("content", "Task completed successfully.")
            computer_actions.append(make_output_text_item(finished_text))
             break
-
+
         elif action_type == "wait":
             computer_actions.append(make_wait_item())
-
+
         elif action_type == "call_user":
-            computer_actions.append(make_output_text_item("I need assistance from the user to proceed with this task."))
-
+            computer_actions.append(
+                make_output_text_item("I need assistance from the user to proceed with this task.")
+            )
+
         elif action_type in ["click", "left_single"]:
             start_box = action_inputs.get("start_box")
             if start_box:
                 coords = eval(start_box)
                 x = int((coords[0] + coords[2]) / 2 * image_width)
                 y = int((coords[1] + coords[3]) / 2 * image_height)
-
+
                 computer_actions.append(make_click_item(x, y, "left"))
-
+
         elif action_type == "double_click":
             start_box = action_inputs.get("start_box")
             if start_box:
                 coords = eval(start_box)
                 x = int((coords[0] + coords[2]) / 2 * image_width)
                 y = int((coords[1] + coords[3]) / 2 * image_height)
-
+
                 computer_actions.append(make_double_click_item(x, y))
-
+
         elif action_type == "right_click":
             start_box = action_inputs.get("start_box")
             if start_box:
                 coords = eval(start_box)
                 x = int((coords[0] + coords[2]) / 2 * image_width)
                 y = int((coords[1] + coords[3]) / 2 * image_height)
-
+
                 computer_actions.append(make_click_item(x, y, "right"))
-
+
         elif action_type == "type":
             content = action_inputs.get("content", "")
             computer_actions.append(make_type_item(content))
-
+
         elif action_type == "hotkey":
             key = action_inputs.get("key", "")
             keys = key.split()
             computer_actions.append(make_keypress_item(keys))
-
+
         elif action_type == "press":
             key = action_inputs.get("key", "")
             computer_actions.append(make_keypress_item([key]))
-
+
         elif action_type == "scroll":
             start_box = action_inputs.get("start_box")
             direction = action_inputs.get("direction", "down")
-
+
             if start_box:
                 coords = eval(start_box)
                 x = int((coords[0] + coords[2]) / 2 * image_width)
                 y = int((coords[1] + coords[3]) / 2 * image_height)
             else:
                 x, y = image_width // 2, image_height // 2
-
+
             scroll_y = 5 if "up" in direction.lower() else -5
             computer_actions.append(make_scroll_item(x, y, 0, scroll_y))
-
+
         elif action_type == "drag":
             start_box = action_inputs.get("start_box")
             end_box = action_inputs.get("end_box")
-
+
             if start_box and end_box:
                 start_coords = eval(start_box)
                 end_coords = eval(end_box)
-
+
                 start_x = int((start_coords[0] + start_coords[2]) / 2 * image_width)
                 start_y = int((start_coords[1] + start_coords[3]) / 2 * image_height)
                 end_x = int((end_coords[0] + end_coords[2]) / 2 * image_width)
                 end_y = int((end_coords[1] + end_coords[3]) / 2 * image_height)
-
+
                 path = [{"x": start_x, "y": start_y}, {"x": end_x, "y": end_y}]
                 computer_actions.append(make_drag_item(path))
-
+
     return computer_actions
 
 
@@ -354,33 +380,35 @@ def pil_to_base64(image: Image.Image) -> str:
     return base64.b64encode(buffer.getvalue()).decode("utf-8")
 
 
-def process_image_for_uitars(image_data: str, max_pixels: int = MAX_PIXELS, min_pixels: int = MIN_PIXELS) -> tuple[Image.Image, int, int]:
+def process_image_for_uitars(
+    image_data: str, max_pixels: int = MAX_PIXELS, min_pixels: int = MIN_PIXELS
+) -> tuple[Image.Image, int, int]:
     """Process image for UITARS model input."""
     # Decode base64 image
-    if image_data.startswith('data:image'):
-        image_data = image_data.split(',')[1]
-
+    if image_data.startswith("data:image"):
+        image_data = image_data.split(",")[1]
+
     image_bytes = base64.b64decode(image_data)
     image = Image.open(BytesIO(image_bytes))
-
+
     original_width, original_height = image.size
-
+
     # Resize image according to UITARS requirements
     if image.width * image.height > max_pixels:
         resize_factor = math.sqrt(max_pixels / (image.width * image.height))
         width = int(image.width * resize_factor)
         height = int(image.height * resize_factor)
         image = image.resize((width, height))
-
+
     if image.width * image.height < min_pixels:
         resize_factor = math.sqrt(min_pixels / (image.width * image.height))
         width = math.ceil(image.width * resize_factor)
         height = math.ceil(image.height * resize_factor)
         image = image.resize((width, height))
-
+
     if image.mode != "RGB":
         image = image.convert("RGB")
-
+
     return image, original_width, original_height
 
 
@@ -391,7 +419,11 @@ def sanitize_message(msg: Any) -> Any:
         for key, value in msg.items():
             if key == "content" and isinstance(value, list):
                 result[key] = [
-                    {k: v for k, v in item.items() if k != "image_url"} if isinstance(item, dict) else item
+                    (
+                        {k: v for k, v in item.items() if k != "image_url"}
+                        if isinstance(item, dict)
+                        else item
+                    )
                     for item in value
                 ]
             else:
@@ -406,38 +438,41 @@ def sanitize_message(msg: Any) -> Any:
 def convert_uitars_messages_to_litellm(messages: Messages) -> List[Dict[str, Any]]:
     """
     Convert UITARS internal message format back to LiteLLM format.
-
+
     This function processes reasoning, computer_call, and computer_call_output messages
     and converts them to the appropriate LiteLLM assistant message format.
-
+
     Args:
         messages: List of UITARS internal messages
-
+
     Returns:
         List of LiteLLM formatted messages
     """
     litellm_messages = []
     current_assistant_content = []
-
+
     for message in messages:
         if isinstance(message, dict):
             message_type = message.get("type")
-
+
             if message_type == "reasoning":
                 # Extract reasoning text from summary
                 summary = message.get("summary", [])
                 if summary and isinstance(summary, list):
                     for summary_item in summary:
-                        if isinstance(summary_item, dict) and summary_item.get("type") == "summary_text":
+                        if (
+                            isinstance(summary_item, dict)
+                            and summary_item.get("type") == "summary_text"
+                        ):
                             reasoning_text = summary_item.get("text", "")
                             if reasoning_text:
                                 current_assistant_content.append(f"Thought: {reasoning_text}")
-
+
             elif message_type == "computer_call":
                 # Convert computer action to UITARS action format
                 action = message.get("action", {})
                 action_type = action.get("type")
-
+
                 if action_type == "click":
                     x, y = action.get("x", 0), action.get("y", 0)
                     button = action.get("button", "left")
@@ -447,59 +482,65 @@ def convert_uitars_messages_to_litellm(messages: Messages) -> List[Dict[str, Any]]:
                         action_text = f"Action: right_single(start_box='({x},{y})')"
                     else:
                         action_text = f"Action: click(start_box='({x},{y})')"
-
+
                 elif action_type == "double_click":
                     x, y = action.get("x", 0), action.get("y", 0)
                     action_text = f"Action: left_double(start_box='({x},{y})')"
-
+
                 elif action_type == "drag":
                     start_x, start_y = action.get("start_x", 0), action.get("start_y", 0)
                     end_x, end_y = action.get("end_x", 0), action.get("end_y", 0)
                     action_text = f"Action: drag(start_box='({start_x},{start_y})', end_box='({end_x},{end_y})')"
-
+
                 elif action_type == "key":
                     key = action.get("key", "")
                     action_text = f"Action: hotkey(key='{key}')"
-
+
                 elif action_type == "type":
                     text = action.get("text", "")
                     # Escape single quotes in the text
                     escaped_text = escape_single_quotes(text)
                     action_text = f"Action: type(content='{escaped_text}')"
-
+
                 elif action_type == "scroll":
                     x, y = action.get("x", 0), action.get("y", 0)
                     direction = action.get("direction", "down")
                     action_text = f"Action: scroll(start_box='({x},{y})', direction='{direction}')"
-
+
                 elif action_type == "wait":
                     action_text = "Action: wait()"
-
+
                 else:
                     # Fallback for unknown action types
                     action_text = f"Action: {action_type}({action})"
-
+
                 current_assistant_content.append(action_text)
-
+
                 # When we hit a computer_call_output, finalize the current assistant message
                 if current_assistant_content:
-                    litellm_messages.append({
-                        "role": "assistant",
-                        "content": [{"type": "text", "text": "\n".join(current_assistant_content)}]
-                    })
+                    litellm_messages.append(
+                        {
+                            "role": "assistant",
+                            "content": [
+                                {"type": "text", "text": "\n".join(current_assistant_content)}
+                            ],
+                        }
+                    )
                     current_assistant_content = []
-
+
             elif message_type == "computer_call_output":
                 # Add screenshot from computer call output
                 output = message.get("output", {})
                 if isinstance(output, dict) and output.get("type") == "input_image":
                     image_url = output.get("image_url", "")
                     if image_url:
-                        litellm_messages.append({
-                            "role": "user",
-                            "content": [{"type": "image_url", "image_url": {"url": image_url}}]
-                        })
-
+                        litellm_messages.append(
+                            {
+                                "role": "user",
+                                "content": [{"type": "image_url", "image_url": {"url": image_url}}],
+                            }
+                        )
+
             elif message.get("role") == "user":
                 # # Handle user messages
                 # content = message.get("content", "")
@@ -514,24 +555,22 @@ def convert_uitars_messages_to_litellm(messages: Messages) -> List[Dict[str, Any]]:
                 #     "content": content
                 # })
                 pass
-
+
     # Add any remaining assistant content
     if current_assistant_content:
-        litellm_messages.append({
-            "role": "assistant",
-            "content": current_assistant_content
-        })
-
+        litellm_messages.append({"role": "assistant", "content": current_assistant_content})
+
     return litellm_messages
 
+
 @register_agent(models=r"(?i).*ui-?tars.*")
 class UITARSConfig:
     """
     UITARS agent configuration using liteLLM for ByteDance-Seed/UI-TARS-1.5-7B model.
-
+
     Supports UITARS vision-language models for computer control.
     """
-
+
     async def predict_step(
         self,
         messages: List[Dict[str, Any]],
@@ -545,11 +584,11 @@ class UITARSConfig:
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         """
         Predict the next step based on input messages.
-
+
         Args:
             messages: Input messages following Responses format
             model: Model name to use
@@ -562,22 +601,22 @@ class UITARSConfig:
             _on_usage: Callback for usage tracking
             _on_screenshot: Callback for screenshot events
             **kwargs: Additional arguments
-
+
         Returns:
             Dictionary with "output" (output items) and "usage" array
         """
         tools = tools or []
-
+
         # Create response items
         response_items = []
-
+
         # Find computer tool for screen dimensions
         computer_tool = None
         for tool_schema in tools:
             if tool_schema["type"] == "computer":
                 computer_tool = tool_schema["computer"]
                 break
-
+
         # Get screen dimensions
         screen_width, screen_height = 1024, 768
         if computer_tool:
@@ -585,20 +624,20 @@ class UITARSConfig:
                 screen_width, screen_height = await computer_tool.get_dimensions()
             except:
                 pass
-
+
         # Process messages to extract instruction and image
         instruction = ""
         image_data = None
-
+
         # Convert messages to list if string
         if isinstance(messages, str):
             messages = [{"role": "user", "content": messages}]
-
+
         # Extract instruction and latest screenshot
         for message in reversed(messages):
             if isinstance(message, dict):
                 content = message.get("content", "")
-
+
                 # Handle different content formats
                 if isinstance(content, str):
                     if not instruction and message.get("role") == "user":
@@ -614,46 +653,41 @@ class UITARSConfig:
                                     image_data = image_url.get("url", "")
                                 else:
                                     image_data = image_url
-
+
             # Also check for computer_call_output with screenshots
             if message.get("type") == "computer_call_output" and not image_data:
                 output = message.get("output", {})
                 if isinstance(output, dict) and output.get("type") == "input_image":
                     image_data = output.get("image_url", "")
-
+
             if instruction and image_data:
                 break
-
+
         if not instruction:
-            instruction = "Help me complete this task by analyzing the screen and taking appropriate actions."
-
+            instruction = (
+                "Help me complete this task by analyzing the screen and taking appropriate actions."
+            )
+
         # Create prompt
         user_prompt = UITARS_PROMPT_TEMPLATE.format(
-            instruction=instruction,
-            action_space=UITARS_ACTION_SPACE,
-            language="English"
+            instruction=instruction, action_space=UITARS_ACTION_SPACE, language="English"
         )
-
+
         # Convert conversation history to LiteLLM format
         history_messages = convert_uitars_messages_to_litellm(messages)
-
+
         # Prepare messages for liteLLM
-        litellm_messages = [
-            {
-                "role": "system",
-                "content": "You are a helpful assistant."
-            }
-        ]
+        litellm_messages = [{"role": "system", "content": "You are a helpful assistant."}]
 
         # Add current user instruction with screenshot
         current_user_message = {
-            "role": "user",
+            "role": "user",
             "content": [
                 {"type": "text", "text": user_prompt},
-            ]
+            ],
         }
         litellm_messages.append(current_user_message)
-
+
         # Process image for UITARS
         if not image_data:
             # Take screenshot if none found in messages
@@ -667,17 +701,22 @@ class UITARSConfig:
                 raise ValueError("No screenshot found in messages and no computer_handler provided")
         processed_image, original_width, original_height = process_image_for_uitars(image_data)
         encoded_image = pil_to_base64(processed_image)
-
+
         # Add conversation history
         if history_messages:
             litellm_messages.extend(history_messages)
         else:
-            litellm_messages.append({
-                "role": "user",
-                "content": [
-                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_image}"}}
-                ]
-            })
+            litellm_messages.append(
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/png;base64,{encoded_image}"},
+                        }
+                    ],
+                }
+            )
 
         # Prepare API call kwargs
         api_kwargs = {
@@ -687,146 +726,142 @@ class UITARSConfig:
             "temperature": kwargs.get("temperature", 0.0),
             "do_sample": kwargs.get("temperature", 0.0) > 0.0,
             "num_retries": max_retries,
-            **{k: v for k, v in kwargs.items() if k not in ["max_tokens", "temperature"]}
+            **{k: v for k, v in kwargs.items() if k not in ["max_tokens", "temperature"]},
         }
-
+
         # Call API start hook
         if _on_api_start:
             await _on_api_start(api_kwargs)
-
+
         # Call liteLLM with UITARS model
         response = await litellm.acompletion(**api_kwargs)
-
+
         # Call API end hook
         if _on_api_end:
             await _on_api_end(api_kwargs, response)
-
+
         # Extract response content
-        response_content = response.choices[0].message.content.strip() # type: ignore
-
+        response_content = response.choices[0].message.content.strip()  # type: ignore
+
         # Parse UITARS response
         parsed_responses = parse_uitars_response(response_content, original_width, original_height)
-
+
         # Convert to computer actions
-        computer_actions = convert_to_computer_actions(parsed_responses, original_width, original_height)
-
+        computer_actions = convert_to_computer_actions(
+            parsed_responses, original_width, original_height
+        )
+
         # Add computer actions to response items
         thought = parsed_responses[0].get("thought", "")
         if thought:
             response_items.append(make_reasoning_item(thought))
         response_items.extend(computer_actions)
-
+
         # Extract usage information
         response_usage = {
-            **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
+            **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(
+                response.usage
+            ).model_dump(),
             "response_cost": response._hidden_params.get("response_cost", 0.0),
         }
         if _on_usage:
             await _on_usage(response_usage)
 
         # Create agent response
-        agent_response = {
-            "output": response_items,
-            "usage": response_usage
-        }
-
+        agent_response = {"output": response_items, "usage": response_usage}
+
         return agent_response
-
+
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str
+        self, model: str, image_b64: str, instruction: str
     ) -> Optional[Tuple[int, int]]:
         """
         Predict click coordinates based on image and instruction.
-
+
         UITARS supports click prediction through its action parsing.
-
+
         Args:
             model: Model name to use
            image_b64: Base64 encoded image
             instruction: Instruction for where to click
-
+
         Returns:
             Tuple with (x, y) coordinates or None
         """
        try:
             # Create prompt using grounding template
-            user_prompt = GROUNDING_UITARS_PROMPT_TEMPLATE.format(
-                instruction=instruction
-            )
-
+            user_prompt = GROUNDING_UITARS_PROMPT_TEMPLATE.format(instruction=instruction)
+
             # Process image for UITARS
             processed_image, original_width, original_height = process_image_for_uitars(image_b64)
             encoded_image = pil_to_base64(processed_image)
-
+
             # Prepare messages for liteLLM
             litellm_messages = [
-                {
-                    "role": "system",
-                    "content": "You are a helpful assistant."
-                },
+                {"role": "system", "content": "You are a helpful assistant."},
                 {
                     "role": "user",
                     "content": [
                         {"type": "text", "text": user_prompt},
-                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_image}"}}
-                    ]
-                }
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/png;base64,{encoded_image}"},
+                        },
+                    ],
+                },
             ]
-
+
             # Prepare API call kwargs
             api_kwargs = {
                 "model": model,
                 "messages": litellm_messages,
                 "max_tokens": 2056,
                 "temperature": 0.0,
-                "do_sample": False
+                "do_sample": False,
             }
-
+
             # Call liteLLM with UITARS model
             response = await litellm.acompletion(**api_kwargs)
-
+
             # Extract response content
-            response_content = response.choices[0].message.content.strip() # type: ignore
-
+            response_content = response.choices[0].message.content.strip()  # type: ignore
+
             print(response_content)
 
             # Parse the response to extract click coordinates
             # Look for click action with coordinates (with special tokens)
             click_pattern = r"click\(point='<\|box_start\|>\((\d+),(\d+)\)<\|box_end\|>'\)"
             match = re.search(click_pattern, response_content)
-
+
             # Fallback: Look for simpler format without special tokens
             if not match:
                 # Pattern for: click(start_box='(x,y)') or click(point='(x,y)')
                 fallback_pattern = r"click\((?:start_box|point)='\((\d+),(\d+)\)'\)"
                 match = re.search(fallback_pattern, response_content)
-
+
             if match:
                 x, y = int(match.group(1)), int(match.group(2))
                 # Scale coordinates back to original image dimensions
                 scale_x = original_width / processed_image.width
                 scale_y = original_height / processed_image.height
-
+
                 scaled_x = int(x * scale_x)
                 scaled_y = int(y * scale_y)
-
+
                 return (scaled_x, scaled_y)
-
+
             return None
-
+
         except Exception as e:
             # Log error and return None
             print(f"Error in predict_click: {e}")
             return None
-
+
     def get_capabilities(self) -> List[AgentCapability]:
         """
         Get list of capabilities supported by this agent config.
-
+
         Returns:
             List of capability strings
         """
-        return ["step", "click"]
+        return ["step", "click"]
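A note on the coordinate handling in this file: parse_uitars_response divides the model-emitted coordinates by 1000 to normalize them to the 0-1 range, and convert_to_computer_actions scales them back to pixel positions using the screenshot's dimensions. A minimal sketch of that round trip, as hypothetical usage and assuming both helpers are importable from agent.loops.uitars exactly as shown in this diff:

# Hypothetical usage sketch of the parsing helpers above; names assumed from this diff.
from agent.loops.uitars import convert_to_computer_actions, parse_uitars_response

raw = "Thought: The submit button is near the top.\nAction: click(start_box='(500,120)')"
# Parsing normalizes (500, 120) to (0.5, 0.12) by dividing by 1000.
parsed = parse_uitars_response(raw, image_width=1920, image_height=1080)
# Conversion scales back to pixels: roughly (960, 129) on a 1920x1080 screenshot.
actions = convert_to_computer_actions(parsed, 1920, 1080)
print(actions)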