cua-agent 0.3.2-py3-none-any.whl → 0.4.0b2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (111)
  1. agent/__init__.py +15 -51
  2. agent/__main__.py +21 -0
  3. agent/adapters/__init__.py +9 -0
  4. agent/adapters/huggingfacelocal_adapter.py +229 -0
  5. agent/agent.py +577 -0
  6. agent/callbacks/__init__.py +17 -0
  7. agent/callbacks/base.py +153 -0
  8. agent/callbacks/budget_manager.py +44 -0
  9. agent/callbacks/image_retention.py +139 -0
  10. agent/callbacks/logging.py +247 -0
  11. agent/callbacks/pii_anonymization.py +259 -0
  12. agent/callbacks/trajectory_saver.py +305 -0
  13. agent/cli.py +290 -0
  14. agent/computer_handler.py +107 -0
  15. agent/decorators.py +90 -0
  16. agent/loops/__init__.py +11 -0
  17. agent/loops/anthropic.py +728 -0
  18. agent/loops/omniparser.py +339 -0
  19. agent/loops/openai.py +95 -0
  20. agent/loops/uitars.py +688 -0
  21. agent/responses.py +207 -0
  22. agent/types.py +79 -0
  23. agent/ui/__init__.py +7 -1
  24. agent/ui/gradio/__init__.py +6 -19
  25. agent/ui/gradio/app.py +80 -1299
  26. agent/ui/gradio/ui_components.py +703 -0
  27. cua_agent-0.4.0b2.dist-info/METADATA +424 -0
  28. cua_agent-0.4.0b2.dist-info/RECORD +30 -0
  29. agent/core/__init__.py +0 -27
  30. agent/core/agent.py +0 -210
  31. agent/core/base.py +0 -217
  32. agent/core/callbacks.py +0 -200
  33. agent/core/experiment.py +0 -249
  34. agent/core/factory.py +0 -122
  35. agent/core/messages.py +0 -332
  36. agent/core/provider_config.py +0 -21
  37. agent/core/telemetry.py +0 -142
  38. agent/core/tools/__init__.py +0 -21
  39. agent/core/tools/base.py +0 -74
  40. agent/core/tools/bash.py +0 -52
  41. agent/core/tools/collection.py +0 -46
  42. agent/core/tools/computer.py +0 -113
  43. agent/core/tools/edit.py +0 -67
  44. agent/core/tools/manager.py +0 -56
  45. agent/core/tools.py +0 -32
  46. agent/core/types.py +0 -88
  47. agent/core/visualization.py +0 -197
  48. agent/providers/__init__.py +0 -4
  49. agent/providers/anthropic/__init__.py +0 -6
  50. agent/providers/anthropic/api/client.py +0 -360
  51. agent/providers/anthropic/api/logging.py +0 -150
  52. agent/providers/anthropic/api_handler.py +0 -140
  53. agent/providers/anthropic/callbacks/__init__.py +0 -5
  54. agent/providers/anthropic/callbacks/manager.py +0 -65
  55. agent/providers/anthropic/loop.py +0 -568
  56. agent/providers/anthropic/prompts.py +0 -23
  57. agent/providers/anthropic/response_handler.py +0 -226
  58. agent/providers/anthropic/tools/__init__.py +0 -33
  59. agent/providers/anthropic/tools/base.py +0 -88
  60. agent/providers/anthropic/tools/bash.py +0 -66
  61. agent/providers/anthropic/tools/collection.py +0 -34
  62. agent/providers/anthropic/tools/computer.py +0 -396
  63. agent/providers/anthropic/tools/edit.py +0 -326
  64. agent/providers/anthropic/tools/manager.py +0 -54
  65. agent/providers/anthropic/tools/run.py +0 -42
  66. agent/providers/anthropic/types.py +0 -16
  67. agent/providers/anthropic/utils.py +0 -381
  68. agent/providers/omni/__init__.py +0 -8
  69. agent/providers/omni/api_handler.py +0 -42
  70. agent/providers/omni/clients/anthropic.py +0 -103
  71. agent/providers/omni/clients/base.py +0 -35
  72. agent/providers/omni/clients/oaicompat.py +0 -195
  73. agent/providers/omni/clients/ollama.py +0 -122
  74. agent/providers/omni/clients/openai.py +0 -155
  75. agent/providers/omni/clients/utils.py +0 -25
  76. agent/providers/omni/image_utils.py +0 -34
  77. agent/providers/omni/loop.py +0 -990
  78. agent/providers/omni/parser.py +0 -307
  79. agent/providers/omni/prompts.py +0 -64
  80. agent/providers/omni/tools/__init__.py +0 -30
  81. agent/providers/omni/tools/base.py +0 -29
  82. agent/providers/omni/tools/bash.py +0 -74
  83. agent/providers/omni/tools/computer.py +0 -179
  84. agent/providers/omni/tools/manager.py +0 -61
  85. agent/providers/omni/utils.py +0 -236
  86. agent/providers/openai/__init__.py +0 -6
  87. agent/providers/openai/api_handler.py +0 -456
  88. agent/providers/openai/loop.py +0 -472
  89. agent/providers/openai/response_handler.py +0 -205
  90. agent/providers/openai/tools/__init__.py +0 -15
  91. agent/providers/openai/tools/base.py +0 -79
  92. agent/providers/openai/tools/computer.py +0 -326
  93. agent/providers/openai/tools/manager.py +0 -106
  94. agent/providers/openai/types.py +0 -36
  95. agent/providers/openai/utils.py +0 -98
  96. agent/providers/uitars/__init__.py +0 -1
  97. agent/providers/uitars/clients/base.py +0 -35
  98. agent/providers/uitars/clients/mlxvlm.py +0 -263
  99. agent/providers/uitars/clients/oaicompat.py +0 -214
  100. agent/providers/uitars/loop.py +0 -660
  101. agent/providers/uitars/prompts.py +0 -63
  102. agent/providers/uitars/tools/__init__.py +0 -1
  103. agent/providers/uitars/tools/computer.py +0 -283
  104. agent/providers/uitars/tools/manager.py +0 -60
  105. agent/providers/uitars/utils.py +0 -264
  106. agent/telemetry.py +0 -21
  107. agent/ui/__main__.py +0 -15
  108. cua_agent-0.3.2.dist-info/METADATA +0 -295
  109. cua_agent-0.3.2.dist-info/RECORD +0 -87
  110. {cua_agent-0.3.2.dist-info → cua_agent-0.4.0b2.dist-info}/WHEEL +0 -0
  111. {cua_agent-0.3.2.dist-info → cua_agent-0.4.0b2.dist-info}/entry_points.txt +0 -0
agent/loops/uitars.py ADDED
@@ -0,0 +1,688 @@
+ """
+ UITARS agent loop implementation using liteLLM for ByteDance-Seed/UI-TARS-1.5-7B
+ """
+
+ import asyncio
+ import json
+ import base64
+ import math
+ import re
+ import ast
+ from typing import Dict, List, Any, AsyncGenerator, Union, Optional
+ from io import BytesIO
+ from PIL import Image
+ import litellm
+ from litellm.types.utils import ModelResponse
+ from litellm.responses.litellm_completion_transformation.transformation import LiteLLMCompletionResponsesConfig
+ from litellm.responses.utils import Usage
+ from openai.types.responses.response_computer_tool_call_param import ActionType, ResponseComputerToolCallParam
+ from openai.types.responses.response_input_param import ComputerCallOutput
+ from openai.types.responses.response_output_message_param import ResponseOutputMessageParam
+ from openai.types.responses.response_reasoning_item_param import ResponseReasoningItemParam, Summary
+
+ from ..decorators import agent_loop
+ from ..types import Messages, AgentResponse, Tools
+ from ..responses import (
+     make_reasoning_item,
+     make_output_text_item,
+     make_click_item,
+     make_double_click_item,
+     make_drag_item,
+     make_keypress_item,
+     make_scroll_item,
+     make_type_item,
+     make_wait_item,
+     make_input_image_item,
+ )
+
+ # Constants from reference code
+ IMAGE_FACTOR = 28
+ MIN_PIXELS = 100 * 28 * 28
+ MAX_PIXELS = 16384 * 28 * 28
+ MAX_RATIO = 200
+
+ FINISH_WORD = "finished"
+ WAIT_WORD = "wait"
+ ENV_FAIL_WORD = "error_env"
+ CALL_USER = "call_user"
+
+ # Action space prompt for UITARS
+ UITARS_ACTION_SPACE = """
+ click(start_box='<|box_start|>(x1,y1)<|box_end|>')
+ left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
+ right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
+ drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
+ hotkey(key='')
+ type(content='') #If you want to submit your input, use "\\n" at the end of `content`.
+ scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
+ wait() #Sleep for 5s and take a screenshot to check for any changes.
+ finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
+ """
+
+ UITARS_PROMPT_TEMPLATE = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
+
+ ## Output Format
+ ```
+ Thought: ...
+ Action: ...
+ ```
+
+ ## Action Space
+ {action_space}
+
+ ## Note
+ - Use {language} in `Thought` part.
+ - Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
+
+ ## User Instruction
+ {instruction}
+ """
+
+
+ def round_by_factor(number: float, factor: int) -> int:
+     """Returns the closest integer to 'number' that is divisible by 'factor'."""
+     return round(number / factor) * factor
+
+
+ def ceil_by_factor(number: float, factor: int) -> int:
+     """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
+     return math.ceil(number / factor) * factor
+
+
+ def floor_by_factor(number: float, factor: int) -> int:
+     """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
+     return math.floor(number / factor) * factor
+
+
+ def smart_resize(
+     height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
+ ) -> tuple[int, int]:
+     """
+     Rescales the image so that the following conditions are met:
+     1. Both dimensions (height and width) are divisible by 'factor'.
+     2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
+     3. The aspect ratio of the image is maintained as closely as possible.
+     """
+     if max(height, width) / min(height, width) > MAX_RATIO:
+         raise ValueError(
+             f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
+         )
+     h_bar = max(factor, round_by_factor(height, factor))
+     w_bar = max(factor, round_by_factor(width, factor))
+     if h_bar * w_bar > max_pixels:
+         beta = math.sqrt((height * width) / max_pixels)
+         h_bar = floor_by_factor(height / beta, factor)
+         w_bar = floor_by_factor(width / beta, factor)
+     elif h_bar * w_bar < min_pixels:
+         beta = math.sqrt(min_pixels / (height * width))
+         h_bar = ceil_by_factor(height * beta, factor)
+         w_bar = ceil_by_factor(width * beta, factor)
+     return h_bar, w_bar
+
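+ # Worked example (illustrative, not from the release): for a 1920x1080
+ # screenshot with the default factor of 28, both sides round to the nearest
+ # multiple of 28 (1080 -> 1092, 1920 -> 1932). The product
+ # 1092 * 1932 = 2,109,744 pixels already lies within
+ # [MIN_PIXELS, MAX_PIXELS] = [78,400, 12,845,056], so no further scaling applies:
+ #
+ #     smart_resize(1080, 1920)  # -> (1092, 1932)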
+
+ def escape_single_quotes(text):
+     """Escape single quotes in text for safe string formatting."""
+     pattern = r"(?<!\\)'"
+     return re.sub(pattern, r"\\'", text)
+
+
+ def parse_action(action_str):
+     """Parse action string into structured format."""
+     try:
+         node = ast.parse(action_str, mode='eval')
+         if not isinstance(node, ast.Expression):
+             raise ValueError("Not an expression")
+
+         call = node.body
+         if not isinstance(call, ast.Call):
+             raise ValueError("Not a function call")
+
+         # Get function name
+         if isinstance(call.func, ast.Name):
+             func_name = call.func.id
+         elif isinstance(call.func, ast.Attribute):
+             func_name = call.func.attr
+         else:
+             func_name = None
+
+         # Get keyword arguments
+         kwargs = {}
+         for kw in call.keywords:
+             key = kw.arg
+             if isinstance(kw.value, ast.Constant):
+                 value = kw.value.value
+             elif isinstance(kw.value, ast.Str):  # Compatibility with older Python
+                 value = kw.value.s
+             else:
+                 value = None
+             kwargs[key] = value
+
+         return {
+             'function': func_name,
+             'args': kwargs
+         }
+
+     except Exception as e:
+         print(f"Failed to parse action '{action_str}': {e}")
+         return None
+
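+ # Example (illustrative): the AST round trip turns a model action string into
+ # a plain dict of keyword arguments:
+ #
+ #     parse_action("click(start_box='(100,200)')")
+ #     # -> {'function': 'click', 'args': {'start_box': '(100,200)'}}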
+
+ def parse_uitars_response(text: str, image_width: int, image_height: int) -> List[Dict[str, Any]]:
+     """Parse UITARS model response into structured actions."""
+     text = text.strip()
+
+     # Extract thought
+     thought = None
+     if text.startswith("Thought:"):
+         thought_match = re.search(r"Thought: (.+?)(?=\s*Action:|$)", text, re.DOTALL)
+         if thought_match:
+             thought = thought_match.group(1).strip()
+
+     # Extract action
+     if "Action:" not in text:
+         raise ValueError("No Action found in response")
+
+     action_str = text.split("Action:")[-1].strip()
+
+     # Special case for type actions: pull out the quoted content and re-escape
+     # it so the rebuilt string parses cleanly
+     if "type(content" in action_str:
+         def extract_content(match):
+             return match.group(1)
+
+         pattern = r"type\(content='(.*?)'\)"
+         content = re.sub(pattern, extract_content, action_str)
+         action_str = escape_single_quotes(content)
+         action_str = "type(content='" + action_str + "')"
+
+     # Parse the action
+     parsed_action = parse_action(action_str.replace("\n", "\\n").lstrip())
+     if parsed_action is None:
+         raise ValueError(f"Could not parse action: {action_str}")
+
+     action_type = parsed_action["function"]
+     params = parsed_action["args"]
+
+     # Process parameters
+     action_inputs = {}
+     for param_name, param in params.items():
+         if param == "":
+             continue
+         param = str(param).lstrip()
+         action_inputs[param_name.strip()] = param
+
+         # Handle coordinate parameters
+         if "start_box" in param_name or "end_box" in param_name:
+             # Parse coordinates like '(x,y)' or '(x1,y1,x2,y2)'
+             numbers = param.replace("(", "").replace(")", "").split(",")
+             float_numbers = [float(num.strip()) / 1000 for num in numbers]  # Normalize from 0-1000 model space to 0-1
+
+             if len(float_numbers) == 2:
+                 # Single point, duplicate for box format
+                 float_numbers = [float_numbers[0], float_numbers[1], float_numbers[0], float_numbers[1]]
+
+             action_inputs[param_name.strip()] = str(float_numbers)
+
+     return [{
+         "thought": thought,
+         "action_type": action_type,
+         "action_inputs": action_inputs,
+         "text": text
+     }]
+
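+ # Example (illustrative): UI-TARS emits coordinates in a 0-1000 virtual space,
+ # so the point (500,300) normalizes to the degenerate box [0.5, 0.3, 0.5, 0.3]
+ # regardless of the actual screen size:
+ #
+ #     parse_uitars_response(
+ #         "Thought: open the menu\nAction: click(start_box='(500,300)')", 1920, 1080
+ #     )
+ #     # -> [{'thought': 'open the menu', 'action_type': 'click',
+ #     #      'action_inputs': {'start_box': '[0.5, 0.3, 0.5, 0.3]'}, 'text': ...}]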
+
+ def convert_to_computer_actions(parsed_responses: List[Dict[str, Any]], image_width: int, image_height: int) -> List[ResponseComputerToolCallParam | ResponseOutputMessageParam]:
+     """Convert parsed UITARS responses to computer actions."""
+     computer_actions = []
+
+     for response in parsed_responses:
+         action_type = response.get("action_type")
+         action_inputs = response.get("action_inputs", {})
+
+         if action_type == "finished":
+             finished_text = action_inputs.get("content", "Task completed successfully.")
+             computer_actions.append(make_output_text_item(finished_text))
+             break
+
+         elif action_type == "wait":
+             computer_actions.append(make_wait_item())
+
+         elif action_type == "call_user":
+             computer_actions.append(make_output_text_item("I need assistance from the user to proceed with this task."))
+
+         elif action_type in ["click", "left_single"]:
+             start_box = action_inputs.get("start_box")
+             if start_box:
+                 coords = ast.literal_eval(start_box)  # safer than eval() on model output
+                 x = int((coords[0] + coords[2]) / 2 * image_width)
+                 y = int((coords[1] + coords[3]) / 2 * image_height)
+
+                 computer_actions.append(make_click_item(x, y, "left"))
+
+         elif action_type in ["double_click", "left_double"]:
+             start_box = action_inputs.get("start_box")
+             if start_box:
+                 coords = ast.literal_eval(start_box)
+                 x = int((coords[0] + coords[2]) / 2 * image_width)
+                 y = int((coords[1] + coords[3]) / 2 * image_height)
+
+                 computer_actions.append(make_double_click_item(x, y))
+
+         elif action_type in ["right_click", "right_single"]:
+             start_box = action_inputs.get("start_box")
+             if start_box:
+                 coords = ast.literal_eval(start_box)
+                 x = int((coords[0] + coords[2]) / 2 * image_width)
+                 y = int((coords[1] + coords[3]) / 2 * image_height)
+
+                 computer_actions.append(make_click_item(x, y, "right"))
+
+         elif action_type == "type":
+             content = action_inputs.get("content", "")
+             computer_actions.append(make_type_item(content))
+
+         elif action_type == "hotkey":
+             key = action_inputs.get("key", "")
+             keys = key.split()
+             computer_actions.append(make_keypress_item(keys))
+
+         elif action_type == "press":
+             key = action_inputs.get("key", "")
+             computer_actions.append(make_keypress_item([key]))
+
+         elif action_type == "scroll":
+             start_box = action_inputs.get("start_box")
+             direction = action_inputs.get("direction", "down")
+
+             if start_box:
+                 coords = ast.literal_eval(start_box)
+                 x = int((coords[0] + coords[2]) / 2 * image_width)
+                 y = int((coords[1] + coords[3]) / 2 * image_height)
+             else:
+                 x, y = image_width // 2, image_height // 2
+
+             scroll_y = 5 if "up" in direction.lower() else -5
+             computer_actions.append(make_scroll_item(x, y, 0, scroll_y))
+
+         elif action_type == "drag":
+             start_box = action_inputs.get("start_box")
+             end_box = action_inputs.get("end_box")
+
+             if start_box and end_box:
+                 start_coords = ast.literal_eval(start_box)
+                 end_coords = ast.literal_eval(end_box)
+
+                 start_x = int((start_coords[0] + start_coords[2]) / 2 * image_width)
+                 start_y = int((start_coords[1] + start_coords[3]) / 2 * image_height)
+                 end_x = int((end_coords[0] + end_coords[2]) / 2 * image_width)
+                 end_y = int((end_coords[1] + end_coords[3]) / 2 * image_height)
+
+                 path = [{"x": start_x, "y": start_y}, {"x": end_x, "y": end_y}]
+                 computer_actions.append(make_drag_item(path))
+
+     return computer_actions
+
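+ # Example (illustrative): denormalizing the box above on a 1920x1080 screen
+ # takes the box midpoint, x = int(0.5 * 1920) = 960 and y = int(0.3 * 1080) = 324,
+ # so the converter emits a left click at (960, 324).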
+
+ def pil_to_base64(image: Image.Image) -> str:
+     """Convert PIL image to base64 string."""
+     buffer = BytesIO()
+     image.save(buffer, format="PNG")
+     return base64.b64encode(buffer.getvalue()).decode("utf-8")
+
+
+ def process_image_for_uitars(image_data: str, max_pixels: int = MAX_PIXELS, min_pixels: int = MIN_PIXELS) -> tuple[Image.Image, int, int]:
+     """Process image for UITARS model input."""
+     # Decode base64 image
+     if image_data.startswith('data:image'):
+         image_data = image_data.split(',')[1]
+
+     image_bytes = base64.b64decode(image_data)
+     image = Image.open(BytesIO(image_bytes))
+
+     original_width, original_height = image.size
+
+     # Resize image according to UITARS requirements
+     if image.width * image.height > max_pixels:
+         resize_factor = math.sqrt(max_pixels / (image.width * image.height))
+         width = int(image.width * resize_factor)
+         height = int(image.height * resize_factor)
+         image = image.resize((width, height))
+
+     if image.width * image.height < min_pixels:
+         resize_factor = math.sqrt(min_pixels / (image.width * image.height))
+         width = math.ceil(image.width * resize_factor)
+         height = math.ceil(image.height * resize_factor)
+         image = image.resize((width, height))
+
+     if image.mode != "RGB":
+         image = image.convert("RGB")
+
+     return image, original_width, original_height
+
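+ # Illustrative usage: a data-URL screenshot goes through decode, pixel-budget
+ # resize, and re-encode; the original dimensions come back so model actions
+ # can be mapped onto the real screen:
+ #
+ #     img, orig_w, orig_h = process_image_for_uitars("data:image/png;base64,...")
+ #     encoded = pil_to_base64(img)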
+
+ def sanitize_message(msg: Any) -> Any:
+     """Return a copy of the message with image_url omitted within content parts."""
+     if isinstance(msg, dict):
+         result = {}
+         for key, value in msg.items():
+             if key == "content" and isinstance(value, list):
+                 result[key] = [
+                     {k: v for k, v in item.items() if k != "image_url"} if isinstance(item, dict) else item
+                     for item in value
+                 ]
+             else:
+                 result[key] = value
+         return result
+     elif isinstance(msg, list):
+         return [sanitize_message(item) for item in msg]
+     else:
+         return msg
+
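+ # Example (illustrative): only the bulky "image_url" payload is dropped;
+ # every other key survives, which keeps logged messages readable:
+ #
+ #     sanitize_message({"role": "user",
+ #                       "content": [{"type": "image_url", "image_url": {"url": "data:..."}}]})
+ #     # -> {'role': 'user', 'content': [{'type': 'image_url'}]}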
+
+ def convert_uitars_messages_to_litellm(messages: Messages) -> List[Dict[str, Any]]:
+     """
+     Convert UITARS internal message format back to LiteLLM format.
+
+     This function processes reasoning, computer_call, and computer_call_output messages
+     and converts them to the appropriate LiteLLM assistant message format.
+
+     Args:
+         messages: List of UITARS internal messages
+
+     Returns:
+         List of LiteLLM formatted messages
+     """
+     litellm_messages = []
+     current_assistant_content = []
+
+     for message in messages:
+         if isinstance(message, dict):
+             message_type = message.get("type")
+
+             if message_type == "reasoning":
+                 # Extract reasoning text from summary
+                 summary = message.get("summary", [])
+                 if summary and isinstance(summary, list):
+                     for summary_item in summary:
+                         if isinstance(summary_item, dict) and summary_item.get("type") == "summary_text":
+                             reasoning_text = summary_item.get("text", "")
+                             if reasoning_text:
+                                 current_assistant_content.append(f"Thought: {reasoning_text}")
+
+             elif message_type == "computer_call":
+                 # Convert computer action to UITARS action format
+                 action = message.get("action", {})
+                 action_type = action.get("type")
+
+                 if action_type == "click":
+                     x, y = action.get("x", 0), action.get("y", 0)
+                     button = action.get("button", "left")
+                     if button == "right":
+                         action_text = f"Action: right_single(start_box='({x},{y})')"
+                     else:
+                         action_text = f"Action: click(start_box='({x},{y})')"
+
+                 elif action_type == "double_click":
+                     x, y = action.get("x", 0), action.get("y", 0)
+                     action_text = f"Action: left_double(start_box='({x},{y})')"
+
+                 elif action_type == "drag":
+                     start_x, start_y = action.get("start_x", 0), action.get("start_y", 0)
+                     end_x, end_y = action.get("end_x", 0), action.get("end_y", 0)
+                     action_text = f"Action: drag(start_box='({start_x},{start_y})', end_box='({end_x},{end_y})')"
+
+                 elif action_type == "key":
+                     key = action.get("key", "")
+                     action_text = f"Action: hotkey(key='{key}')"
+
+                 elif action_type == "type":
+                     text = action.get("text", "")
+                     # Escape single quotes in the text
+                     escaped_text = escape_single_quotes(text)
+                     action_text = f"Action: type(content='{escaped_text}')"
+
+                 elif action_type == "scroll":
+                     x, y = action.get("x", 0), action.get("y", 0)
+                     direction = action.get("direction", "down")
+                     action_text = f"Action: scroll(start_box='({x},{y})', direction='{direction}')"
+
+                 elif action_type == "wait":
+                     action_text = "Action: wait()"
+
+                 else:
+                     # Fallback for unknown action types
+                     action_text = f"Action: {action_type}({action})"
+
+                 current_assistant_content.append(action_text)
+
+                 # Finalize the current assistant message after each action; the
+                 # following message is typically a computer_call_output screenshot
+                 if current_assistant_content:
+                     litellm_messages.append({
+                         "role": "assistant",
+                         "content": [{"type": "text", "text": "\n".join(current_assistant_content)}]
+                     })
+                     current_assistant_content = []
+
+             elif message_type == "computer_call_output":
+                 # Add screenshot from computer call output
+                 output = message.get("output", {})
+                 if isinstance(output, dict) and output.get("type") == "input_image":
+                     image_url = output.get("image_url", "")
+                     if image_url:
+                         litellm_messages.append({
+                             "role": "user",
+                             "content": [{"type": "image_url", "image_url": {"url": image_url}}]
+                         })
+
+             elif message.get("role") == "user":
+                 # Plain user messages are folded into the prompt when it is built, so skip them here
+                 pass
+
+     # Add any remaining assistant content
+     if current_assistant_content:
+         litellm_messages.append({
+             "role": "assistant",
+             "content": [{"type": "text", "text": "\n".join(current_assistant_content)}]
+         })
+
+     return litellm_messages
+
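+ # Illustrative round trip: a reasoning item plus a click at (960, 324) becomes
+ # one assistant turn in the text format the model was trained on:
+ #
+ #     convert_uitars_messages_to_litellm([
+ #         {"type": "reasoning", "summary": [{"type": "summary_text", "text": "open the menu"}]},
+ #         {"type": "computer_call", "action": {"type": "click", "x": 960, "y": 324}},
+ #     ])
+ #     # -> [{'role': 'assistant', 'content': [{'type': 'text',
+ #     #      'text': "Thought: open the menu\nAction: click(start_box='(960,324)')"}]}]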
+ @agent_loop(models=r"(?i).*ui-?tars.*", priority=10)
+ async def uitars_loop(
+     messages: Messages,
+     model: str,
+     tools: Optional[List[Dict[str, Any]]] = None,
+     max_retries: Optional[int] = None,
+     stream: bool = False,
+     computer_handler=None,
+     use_prompt_caching: Optional[bool] = False,
+     _on_api_start=None,
+     _on_api_end=None,
+     _on_usage=None,
+     _on_screenshot=None,
+     **kwargs
+ ) -> Union[AgentResponse, AsyncGenerator[Dict[str, Any], None]]:
+     """
+     UITARS agent loop using liteLLM for the ByteDance-Seed/UI-TARS-1.5-7B model.
+
+     Supports UITARS vision-language models for computer control.
+     """
+     tools = tools or []
+
+     # Create response items
+     response_items = []
+
+     # Find computer tool for screen dimensions
+     computer_tool = None
+     for tool_schema in tools:
+         if tool_schema.get("type") == "computer":
+             computer_tool = tool_schema["computer"]
+             break
+
+     # Get screen dimensions (fall back to 1024x768 if unavailable)
+     screen_width, screen_height = 1024, 768
+     if computer_tool:
+         try:
+             screen_width, screen_height = await computer_tool.get_dimensions()
+         except Exception:
+             pass
+
+     # Process messages to extract instruction and image
+     instruction = ""
+     image_data = None
+
+     # Convert messages to list if string
+     if isinstance(messages, str):
+         messages = [{"role": "user", "content": messages}]
+
+     # Extract instruction and latest screenshot
+     for message in reversed(messages):
+         if isinstance(message, dict):
+             content = message.get("content", "")
+
+             # Handle different content formats
+             if isinstance(content, str):
+                 if not instruction and message.get("role") == "user":
+                     instruction = content
+             elif isinstance(content, list):
+                 for item in content:
+                     if isinstance(item, dict):
+                         if item.get("type") == "text" and not instruction:
+                             instruction = item.get("text", "")
+                         elif item.get("type") == "image_url" and not image_data:
+                             image_url = item.get("image_url", {})
+                             if isinstance(image_url, dict):
+                                 image_data = image_url.get("url", "")
+                             else:
+                                 image_data = image_url
+
+             # Also check for computer_call_output with screenshots
+             if message.get("type") == "computer_call_output" and not image_data:
+                 output = message.get("output", {})
+                 if isinstance(output, dict) and output.get("type") == "input_image":
+                     image_data = output.get("image_url", "")
+
+             if instruction and image_data:
+                 break
+
+     if not instruction:
+         instruction = "Help me complete this task by analyzing the screen and taking appropriate actions."
+
+     # Create prompt
+     user_prompt = UITARS_PROMPT_TEMPLATE.format(
+         instruction=instruction,
+         action_space=UITARS_ACTION_SPACE,
+         language="English"
+     )
+
+     # Convert conversation history to LiteLLM format
+     history_messages = convert_uitars_messages_to_litellm(messages)
+
+     # Prepare messages for liteLLM
+     litellm_messages = [
+         {
+             "role": "system",
+             "content": "You are a helpful assistant."
+         }
+     ]
+
+     # Add current user instruction with screenshot
+     current_user_message = {
+         "role": "user",
+         "content": [
+             {"type": "text", "text": user_prompt},
+         ]
+     }
+     litellm_messages.append(current_user_message)
+
+     # Process image for UITARS
+     if not image_data:
+         # Take screenshot if none found in messages
+         if computer_handler:
+             image_data = await computer_handler.screenshot()
+             if _on_screenshot:
+                 await _on_screenshot(image_data, "screenshot_before")
+
+             # Add screenshot to output items so it can be retained in history
+             response_items.append(make_input_image_item(image_data))
+         else:
+             raise ValueError("No screenshot found in messages and no computer_handler provided")
+     processed_image, original_width, original_height = process_image_for_uitars(image_data)
+     encoded_image = pil_to_base64(processed_image)
+
+     # Add conversation history, or the current screenshot if there is no history yet
+     if history_messages:
+         litellm_messages.extend(history_messages)
+     else:
+         litellm_messages.append({
+             "role": "user",
+             "content": [
+                 {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_image}"}}
+             ]
+         })
+
+     # Prepare API call kwargs
+     api_kwargs = {
+         "model": model,
+         "messages": litellm_messages,
+         "max_tokens": kwargs.get("max_tokens", 500),
+         "temperature": kwargs.get("temperature", 0.0),
+         "do_sample": kwargs.get("temperature", 0.0) > 0.0,
+         "num_retries": max_retries,
+         **{k: v for k, v in kwargs.items() if k not in ["max_tokens", "temperature"]}
+     }
+
+     # Call API start hook
+     if _on_api_start:
+         await _on_api_start(api_kwargs)
+
+     # Call liteLLM with UITARS model
+     response = await litellm.acompletion(**api_kwargs)
+
+     # Call API end hook
+     if _on_api_end:
+         await _on_api_end(api_kwargs, response)
+
+     # Extract response content
+     response_content = response.choices[0].message.content.strip()  # type: ignore
+
+     # Parse UITARS response
+     parsed_responses = parse_uitars_response(response_content, original_width, original_height)
+
+     # Convert to computer actions
+     computer_actions = convert_to_computer_actions(parsed_responses, original_width, original_height)
+
+     # Add computer actions to response items
+     thought = parsed_responses[0].get("thought", "")
+     if thought:
+         response_items.append(make_reasoning_item(thought))
+     response_items.extend(computer_actions)
+
+     # Extract usage information
+     response_usage = {
+         **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
+         "response_cost": response._hidden_params.get("response_cost", 0.0),
+     }
+     if _on_usage:
+         await _on_usage(response_usage)
+
+     # Create agent response
+     agent_response = {
+         "output": response_items,
+         "usage": response_usage
+     }
+
+     return agent_response
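+
+ # Usage sketch (illustrative; identifiers below are hypothetical): the
+ # @agent_loop decorator registers this loop for any model id matching
+ # r"(?i).*ui-?tars.*", so a single turn could be driven directly:
+ #
+ #     result = await uitars_loop(
+ #         messages=[{"role": "user", "content": "Open the settings menu"}],
+ #         model="huggingface/ByteDance-Seed/UI-TARS-1.5-7B",  # hypothetical provider prefix
+ #         computer_handler=my_computer,  # hypothetical handler exposing screenshot()
+ #     )
+ #     # result["output"] holds reasoning and computer_call items;
+ #     # result["usage"] holds token counts and cost.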