cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (79) hide show
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/__init__.py +4 -0
  4. agent/adapters/azure_ml_adapter.py +283 -0
  5. agent/adapters/cua_adapter.py +161 -0
  6. agent/adapters/huggingfacelocal_adapter.py +67 -125
  7. agent/adapters/human_adapter.py +116 -114
  8. agent/adapters/mlxvlm_adapter.py +110 -99
  9. agent/adapters/models/__init__.py +41 -0
  10. agent/adapters/models/generic.py +78 -0
  11. agent/adapters/models/internvl.py +290 -0
  12. agent/adapters/models/opencua.py +115 -0
  13. agent/adapters/models/qwen2_5_vl.py +78 -0
  14. agent/agent.py +337 -185
  15. agent/callbacks/__init__.py +9 -4
  16. agent/callbacks/base.py +45 -31
  17. agent/callbacks/budget_manager.py +22 -10
  18. agent/callbacks/image_retention.py +54 -98
  19. agent/callbacks/logging.py +55 -42
  20. agent/callbacks/operator_validator.py +35 -33
  21. agent/callbacks/otel.py +291 -0
  22. agent/callbacks/pii_anonymization.py +19 -16
  23. agent/callbacks/prompt_instructions.py +47 -0
  24. agent/callbacks/telemetry.py +99 -61
  25. agent/callbacks/trajectory_saver.py +95 -69
  26. agent/cli.py +269 -119
  27. agent/computers/__init__.py +14 -9
  28. agent/computers/base.py +32 -19
  29. agent/computers/cua.py +52 -25
  30. agent/computers/custom.py +78 -71
  31. agent/decorators.py +23 -14
  32. agent/human_tool/__init__.py +2 -7
  33. agent/human_tool/__main__.py +6 -2
  34. agent/human_tool/server.py +48 -37
  35. agent/human_tool/ui.py +359 -235
  36. agent/integrations/hud/__init__.py +38 -99
  37. agent/integrations/hud/agent.py +369 -0
  38. agent/integrations/hud/proxy.py +166 -52
  39. agent/loops/__init__.py +44 -14
  40. agent/loops/anthropic.py +579 -492
  41. agent/loops/base.py +19 -15
  42. agent/loops/composed_grounded.py +136 -150
  43. agent/loops/fara/__init__.py +8 -0
  44. agent/loops/fara/config.py +506 -0
  45. agent/loops/fara/helpers.py +357 -0
  46. agent/loops/fara/schema.py +143 -0
  47. agent/loops/gelato.py +183 -0
  48. agent/loops/gemini.py +935 -0
  49. agent/loops/generic_vlm.py +601 -0
  50. agent/loops/glm45v.py +140 -135
  51. agent/loops/gta1.py +48 -51
  52. agent/loops/holo.py +218 -0
  53. agent/loops/internvl.py +180 -0
  54. agent/loops/moondream3.py +493 -0
  55. agent/loops/omniparser.py +326 -226
  56. agent/loops/openai.py +50 -51
  57. agent/loops/opencua.py +134 -0
  58. agent/loops/uiins.py +175 -0
  59. agent/loops/uitars.py +247 -206
  60. agent/loops/uitars2.py +951 -0
  61. agent/playground/__init__.py +5 -0
  62. agent/playground/server.py +301 -0
  63. agent/proxy/examples.py +61 -57
  64. agent/proxy/handlers.py +46 -39
  65. agent/responses.py +447 -347
  66. agent/tools/__init__.py +24 -0
  67. agent/tools/base.py +253 -0
  68. agent/tools/browser_tool.py +423 -0
  69. agent/types.py +11 -5
  70. agent/ui/__init__.py +1 -1
  71. agent/ui/__main__.py +1 -1
  72. agent/ui/gradio/app.py +25 -22
  73. agent/ui/gradio/ui_components.py +314 -167
  74. cua_agent-0.7.16.dist-info/METADATA +85 -0
  75. cua_agent-0.7.16.dist-info/RECORD +79 -0
  76. {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
  77. cua_agent-0.4.22.dist-info/METADATA +0 -436
  78. cua_agent-0.4.22.dist-info/RECORD +0 -51
  79. {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/loops/uitars2.py ADDED
@@ -0,0 +1,951 @@
1
+ """
2
+ UITARS-2 agent loop implementation using LiteLLM.
3
+ - Prepends a system prompt modeled after the training prompts in examples/seed_16_gui.ipynb
4
+ - Converts Responses items -> completion messages
5
+ - Calls litellm.acompletion
6
+ - Parses <seed:tool_call> ... </seed:tool_call> outputs back into Responses items (computer actions)
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import base64
12
+ import io
13
+ import json
14
+ import re
15
+ from typing import Any, Dict, List, Optional, Tuple
16
+
17
+ import litellm
18
+ from litellm.responses.litellm_completion_transformation.transformation import (
19
+ LiteLLMCompletionResponsesConfig,
20
+ )
21
+
22
+ from ..decorators import register_agent
23
+ from .omniparser import get_last_computer_call_output # type: ignore
24
+
25
+ try:
26
+ from PIL import Image # type: ignore
27
+ except Exception: # pragma: no cover
28
+ Image = None # type: ignore
29
+ from ..responses import (
30
+ convert_responses_items_to_completion_messages,
31
+ make_click_item,
32
+ make_double_click_item,
33
+ make_drag_item,
34
+ make_function_call_item,
35
+ make_keypress_item,
36
+ make_move_item,
37
+ make_output_text_item,
38
+ make_reasoning_item,
39
+ make_screenshot_item,
40
+ make_scroll_item,
41
+ make_type_item,
42
+ make_wait_item,
43
+ )
44
+ from ..types import AgentCapability
45
+
46
# Shared description for plain click-target coordinates.
_CLICK_POINT_DESC = "Click coordinates. The format is: <point>x y</point>"


def _point_prop(description: str) -> Dict[str, Any]:
    """Schema fragment for a string parameter encoded as ``<point>x y</point>``."""
    return {"type": "string", "description": description}


def _fn_schema(
    name: str,
    description: str,
    properties: Optional[Dict[str, Any]] = None,
    required: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """Build one function schema.

    Key insertion order ("type", "name", "parameters", "description" — and
    "type", "properties", "required" inside parameters) is deliberate: the
    schemas are rendered with json.dumps into the system prompt, and dict
    order determines the rendered text.
    """
    if properties is None:
        parameters: Dict[str, Any] = {}
    else:
        parameters = {
            "type": "object",
            "properties": properties,
            "required": required if required is not None else [],
        }
    return {
        "type": "function",
        "name": name,
        "parameters": parameters,
        "description": description,
    }


# Built-in UI-TARS tool schemas, rendered into the system prompt.
TOOL_SCHEMAS: List[Dict[str, Any]] = [
    _fn_schema("open_computer", "Open computer."),
    _fn_schema(
        "click",
        "Mouse left single click action.",
        {"point": _point_prop(_CLICK_POINT_DESC)},
        ["point"],
    ),
    _fn_schema(
        "left_double",
        "Mouse left double click action.",
        {"point": _point_prop(_CLICK_POINT_DESC)},
        ["point"],
    ),
    _fn_schema(
        "right_single",
        "Mouse right single click action.",
        {"point": _point_prop(_CLICK_POINT_DESC)},
        ["point"],
    ),
    _fn_schema(
        "scroll",
        "Scroll action.",
        {
            "point": _point_prop(
                "Scroll start position. If not specified, default to execute on the current mouse position. The format is: <point>x y</point>"
            ),
            "direction": {
                "type": "string",
                "description": "Scroll direction.",
                "enum": ["up", "down", "left", "right"],
            },
        },
        ["direction"],
    ),
    _fn_schema(
        "move_to",
        "Mouse move action.",
        {"point": _point_prop("Target coordinates. The format is: <point>x y</point>")},
        ["point"],
    ),
    _fn_schema(
        "hotkey",
        "Press hotkey.",
        {
            "key": {
                "type": "string",
                "description": "Hotkeys you want to press. Split keys with a space and use lowercase.",
            }
        },
        ["key"],
    ),
    _fn_schema(
        "finished",
        "This function is used to indicate the completion of a task by providing the final answer or response.",
        {
            "content": {
                "type": "string",
                "description": "Provide the final answer or response to complete the task.",
            }
        },
    ),
    _fn_schema(
        "press",
        "Press key.",
        {
            "key": {
                "type": "string",
                "description": "Key you want to press. Only one key can be pressed at one time.",
            }
        },
        ["key"],
    ),
    _fn_schema(
        "release",
        "Release key.",
        {
            "key": {
                "type": "string",
                "description": "Key you want to release. Only one key can be released at one time.",
            }
        },
        ["key"],
    ),
    _fn_schema(
        "mouse_down",
        "Mouse down action.",
        {
            "point": _point_prop(
                "Mouse down position. If not specified, default to execute on the current mouse position. The format is: <point>x y</point>"
            ),
            "button": {
                "type": "string",
                "description": "Down button. Default to left.",
                "enum": ["left", "right"],
            },
        },
    ),
    _fn_schema(
        "mouse_up",
        "Mouse up action.",
        {
            "point": _point_prop(
                "Mouse up position. If not specified, default to execute on the current mouse position. The format is: <point>x y</point>"
            ),
            "button": {
                "type": "string",
                "description": "Up button. Default to left.",
                "enum": ["left", "right"],
            },
        },
    ),
    _fn_schema(
        "call_user",
        "This function is used to interact with the user by displaying a message and requesting their input, feedback, or guidance.",
        {
            "content": {
                "type": "string",
                "description": "Message or information displayed to the user to request their input, feedback, or guidance.",
            }
        },
    ),
    _fn_schema(
        "wait",
        "Wait for a while.",
        {"time": {"type": "integer", "description": "Wait time in seconds."}},
    ),
    _fn_schema(
        "drag",
        "Mouse left button drag action.",
        {
            "start_point": _point_prop("Drag start point. The format is: <point>x y</point>"),
            "end_point": _point_prop("Drag end point. The format is: <point>x y</point>"),
        },
        ["start_point", "end_point"],
    ),
    _fn_schema(
        "type",
        "Type content.",
        {
            "content": {
                "type": "string",
                "description": "Type content. If you want to submit your input, use \\n at the end of content.",
            }
        },
        ["content"],
    ),
    _fn_schema("take_screenshot", "Take screenshot."),
]
299
+
300
+
301
+ def _format_tool_schemas_json_lines(schemas: List[Dict[str, Any]]) -> str:
302
+ # Nicely formatted: pretty JSON with indentation, separated by blank lines
303
+ return "\n\n".join(json.dumps(s, ensure_ascii=False, indent=2) for s in schemas) + "\n\n"
304
+
305
+
306
# System-prompt scaffolding. The final prompt is:
#   _PROMPT_PREFIX + <JSON tool schemas> + _PROMPT_SUFFIX
# The prefix sets up the thinking-mode instructions (with the model's sentinel
# think tags) and the GUI-task framing; the suffix documents the
# <seed:tool_call> call syntax the parser in this module expects.
_PROMPT_PREFIX = (
    "You should begin by detailing the internal reasoning process, and then present the answer to the user. "
    "The reasoning process should be enclosed within <think_never_used_51bce0c785ca2f68081bfa7d91973934> "
    "</think_never_used_51bce0c785ca2f68081bfa7d91973934> tags, as follows:\n"
    "<think_never_used_51bce0c785ca2f68081bfa7d91973934> reasoning process here "
    "</think_never_used_51bce0c785ca2f68081bfa7d91973934> answer here.\n\n"
    "You have different modes of thinking:\n"
    "Unrestricted think mode: Engage in an internal thinking process with thorough reasoning and reflections. "
    "You have an unlimited budget for thinking tokens and can continue thinking until you fully solve the problem.\n"
    "Efficient think mode: Provide a concise internal thinking process with efficient reasoning and reflections. "
    "You don't have a strict token budget but be less verbose and more direct in your thinking.\n"
    "No think mode: Respond directly to the question without any internal reasoning process or extra thinking tokens. "
    "Still follow the template with the minimum required thinking tokens to justify the answer.\n"
    "Budgeted think mode: Limit your internal reasoning and reflections to stay within the specified token budget\n\n"
    "Based on the complexity of the problem, select the appropriate mode for reasoning among the provided options listed below.\n\n"
    "Provided Mode(s):\nEfficient think.\n\n"
    "You are provided with a task description, a history of previous actions, and corresponding screenshots. "
    "Your goal is to perform the next action to complete the task. "
    "If performing the same action multiple times results in a static screen with no changes, attempt a modified or alternative action.\n\n"
    "## Function Definition\n\n"
    "- You have access to the following functions:\n\n"
)

# Suffix: tool-call grammar. <gui_think>/<seed:tool_call> here must match the
# regexes in _parse_seed_tool_calls and the rendering in _to_uitars_messages.
_PROMPT_SUFFIX = (
    "- To call a function, use the following structure without any suffix:\n\n"
    "<gui_think> reasoning process </gui_think>\n"
    "<seed:tool_call><function=example_function_name><parameter=example_parameter_1>value_1</parameter>"
    "<parameter=example_parameter_2>multiline...\n</parameter></function></seed:tool_call>\n\n"
    "## Important Notes\n"
    "- Function calls must begin with <function= and end with </function>.\n"
    "- All required parameters must be explicitly provided.\n"
    "\n## Additional Notes\n"
    "- You can execute multiple actions within a single tool call. For example:\n"
    "<seed:tool_call><function=example_function_1><parameter=example_parameter_1>value_1</parameter><parameter=example_parameter_2>\n"
    "This is the value for the second parameter\nthat can span\nmultiple lines\n"
    "</parameter></function><function=example_function_2><parameter=example_parameter_3>value_4</parameter></function></seed:tool_call>"
)


# Default system prompt using only the built-in schemas; predict_step rebuilds
# this dynamically when the caller supplies extra function tools.
SYSTEM_PROMPT = _PROMPT_PREFIX + _format_tool_schemas_json_lines(TOOL_SCHEMAS) + _PROMPT_SUFFIX
346
+
347
+
348
+ def _extract_function_schemas_from_tools(
349
+ tools: Optional[List[Dict[str, Any]]],
350
+ ) -> List[Dict[str, Any]]:
351
+ schemas: List[Dict[str, Any]] = []
352
+ if not tools:
353
+ return schemas
354
+ for t in tools:
355
+ if t.get("type") == "function":
356
+ fn = t.get("function", {})
357
+ name = fn.get("name")
358
+ params = fn.get("parameters", {})
359
+ desc = fn.get("description", "")
360
+ if name:
361
+ schemas.append(
362
+ {
363
+ "type": "function",
364
+ "name": name,
365
+ "parameters": params if isinstance(params, dict) else {},
366
+ "description": desc,
367
+ }
368
+ )
369
+ return schemas
370
+
371
+
372
+ def _parse_seed_tool_calls(text: str) -> List[Dict[str, Any]]:
373
+ """Parse <seed:tool_call> blocks into a list of {function, parameters} dicts.
374
+ Also captures optional <gui_think>...</gui_think> as reasoning.
375
+ """
376
+ actions: List[Dict[str, Any]] = []
377
+ if not text:
378
+ return actions
379
+
380
+ # Extract reasoning if present
381
+ reasoning_text = None
382
+ think_match = re.search(r"<gui_think>([\s\S]*?)</gui_think>", text)
383
+ if think_match:
384
+ reasoning_text = think_match.group(1).strip()
385
+
386
+ # Iterate each seed tool_call block
387
+ for block in re.finditer(r"<seed:tool_call>([\s\S]*?)</seed:tool_call>", text):
388
+ content = block.group(1)
389
+ # One or multiple <function=...>...</function> inside
390
+ for fmatch in re.finditer(r"<function=([\w_]+)>([\s\S]*?)</function>", content):
391
+ fname = fmatch.group(1)
392
+ inner = fmatch.group(2)
393
+ params: Dict[str, str] = {}
394
+ for pmatch in re.finditer(r"<parameter=([\w_]+)>([\s\S]*?)</parameter>", inner):
395
+ pname = pmatch.group(1)
396
+ pval = pmatch.group(2).strip()
397
+ params[pname] = pval
398
+ actions.append({"function": fname, "parameters": params})
399
+
400
+ # If we have a global reasoning and at least one action, attach it to first
401
+ if reasoning_text and actions:
402
+ actions[0]["reasoning"] = reasoning_text
403
+ elif reasoning_text:
404
+ actions.append({"function": "reasoning", "parameters": {"content": reasoning_text}})
405
+
406
+ return actions
407
+
408
+
409
+ def _normalize_xy_to_uitars(x: int, y: int, width: int, height: int) -> Tuple[int, int]:
410
+ width = max(1, int(width))
411
+ height = max(1, int(height))
412
+ nx = max(0, min(1000, int(round((x / width) * 1000))))
413
+ ny = max(0, min(1000, int(round((y / height) * 1000))))
414
+ return nx, ny
415
+
416
+
417
+ def _denormalize_xy_from_uitars(nx: float, ny: float, width: int, height: int) -> Tuple[int, int]:
418
+ width = max(1, int(width))
419
+ height = max(1, int(height))
420
+ x = int(round((nx / 1000.0) * width))
421
+ y = int(round((ny / 1000.0) * height))
422
+ return x, y
423
+
424
+
425
def _map_computer_action_to_function(
    action: Dict[str, Any], width: int, height: int
) -> Optional[Dict[str, Any]]:
    """Map a computer action item to a UI-TARS function + string parameters.

    Returns a dict like {"function": name, "parameters": {...}} or None when
    the action type is unknown or required coordinates are missing.
    """

    def point_str(px: Any, py: Any) -> str:
        # Render pixel coordinates as the normalized <point>x y</point> form.
        nx, ny = _normalize_xy_to_uitars(int(px), int(py), width, height)
        return f"<point>{nx} {ny}</point>"

    kind = action.get("type") or action.get("action")

    if kind == "click":
        x, y = action.get("x"), action.get("y")
        if x is None or y is None:
            return None
        name = "right_single" if action.get("button", "left") == "right" else "click"
        return {"function": name, "parameters": {"point": point_str(x, y)}}

    if kind == "double_click":
        x, y = action.get("x"), action.get("y")
        if x is None or y is None:
            return None
        return {"function": "left_double", "parameters": {"point": point_str(x, y)}}

    if kind == "move":
        x, y = action.get("x"), action.get("y")
        if x is None or y is None:
            return None
        return {"function": "move_to", "parameters": {"point": point_str(x, y)}}

    if kind == "keypress":
        keys = action.get("keys", [])
        if not (isinstance(keys, list) and keys):
            return None
        if len(keys) == 1:
            return {"function": "press", "parameters": {"key": keys[0]}}
        return {"function": "hotkey", "parameters": {"key": " ".join(keys)}}

    if kind == "type":
        return {"function": "type", "parameters": {"content": action.get("text", "")}}

    if kind == "scroll":
        point = point_str(action.get("x", 512), action.get("y", 512))
        sx, sy = action.get("scroll_x", 0), action.get("scroll_y", 0)
        # Convention: positive scroll_y means "up" (see _to_response_items);
        # vertical movement wins over horizontal, default is "down".
        if sy and sy > 0:
            direction = "up"
        elif sy and sy < 0:
            direction = "down"
        elif sx and sx > 0:
            direction = "right"
        elif sx and sx < 0:
            direction = "left"
        else:
            direction = "down"
        return {
            "function": "scroll",
            "parameters": {"direction": direction, "point": point},
        }

    if kind == "drag":
        path = action.get("path", [])
        if not (isinstance(path, list) and len(path) >= 2):
            return None
        start, end = path[0], path[-1]
        sx, sy = start.get("x"), start.get("y")
        ex, ey = end.get("x"), end.get("y")
        if sx is None or sy is None or ex is None or ey is None:
            return None
        return {
            "function": "drag",
            "parameters": {
                "start_point": point_str(sx, sy),
                "end_point": point_str(ex, ey),
            },
        }

    if kind == "wait":
        return {"function": "wait", "parameters": {}}

    if kind == "screenshot":
        return {"function": "take_screenshot", "parameters": {}}

    # Unknown action type.
    return None
508
+
509
+
510
def _to_uitars_messages(
    messages: List[Dict[str, Any]], width: int, height: int
) -> List[Dict[str, Any]]:
    """Convert responses items into completion messages tailored for UI-TARS.

    - User content is passed through similar to convert_responses_items_to_completion_messages
    - Assistant/tool history is rendered as text with <gui_think> and <seed:tool_call> blocks

    Consecutive reasoning/computer_call/function_call items are accumulated
    into one pending "seed block" and flushed as a single assistant message
    whenever a user message, assistant text, or call output is encountered.
    `width`/`height` are screen dimensions used to normalize coordinates.
    """
    uitars_messages: List[Dict[str, Any]] = []

    def flush_seed_block(pending_think: Optional[str], pending_functions: List[Dict[str, Any]]) -> None:
        # Emit the accumulated reasoning + tool calls as one assistant message
        # in the <gui_think>/<seed:tool_call> syntax described in the prompt.
        if not pending_think and not pending_functions:
            return
        parts: List[str] = []
        if pending_think:
            parts.append(f"<gui_think> {pending_think} </gui_think>")
        if pending_functions:
            inner = []
            for f in pending_functions:
                fname = f["function"]
                params = f.get("parameters", {})
                param_blocks = []
                for k, v in params.items():
                    param_blocks.append(f"<parameter={k}>{v}</parameter>")
                inner.append(f"<function={fname}>{''.join(param_blocks)}</function>")
            parts.append(f"<seed:tool_call>{''.join(inner)}</seed:tool_call>")
        uitars_messages.append({"role": "assistant", "content": "".join(parts)})

    # Accumulators for a single assistant seed block
    pending_think: Optional[str] = None
    pending_functions: List[Dict[str, Any]] = []

    for msg in messages:
        mtype = msg.get("type")
        role = msg.get("role")

        # On any user message, flush current assistant block
        if role == "user" or mtype == "user":
            flush_seed_block(pending_think, pending_functions)
            pending_think, pending_functions = None, []

            content = msg.get("content", "")
            if isinstance(content, list):
                # Structured content: translate input_image/input_text parts
                # into chat-completions image_url/text parts.
                completion_content = []
                for item in content:
                    if item.get("type") == "input_image":
                        completion_content.append(
                            {"type": "image_url", "image_url": {"url": item.get("image_url")}}
                        )
                    elif item.get("type") in ("input_text", "text"):
                        completion_content.append({"type": "text", "text": item.get("text")})
                uitars_messages.append({"role": "user", "content": completion_content})
            elif isinstance(content, str):
                uitars_messages.append({"role": "user", "content": content})
            continue

        # Reasoning item
        if mtype == "reasoning":
            # Responses reasoning stores summary list
            summary = msg.get("summary", [])
            texts = [
                s.get("text", "")
                for s in summary
                if isinstance(s, dict) and s.get("type") == "summary_text"
            ]
            if texts:
                # NOTE: replaces (does not append to) any earlier pending think text.
                pending_think = "\n".join([t for t in texts if t])
            continue

        # Computer/tool calls -> map to functions
        if mtype == "computer_call":
            f = _map_computer_action_to_function(msg.get("action", {}), width, height)
            if f:
                pending_functions.append(f)
            continue
        if mtype == "function_call":
            # Include custom tools as-is
            name = msg.get("name")
            try:
                args_obj = json.loads(msg.get("arguments", "{}"))
            except json.JSONDecodeError:
                args_obj = {}
            # Ensure string values
            params = {k: (str(v) if not isinstance(v, str) else v) for k, v in args_obj.items()}
            pending_functions.append({"function": name, "parameters": params})
            continue

        # If assistant message text is given, flush current block and add as plain assistant text
        if role == "assistant" or mtype == "message":
            flush_seed_block(pending_think, pending_functions)
            pending_think, pending_functions = None, []
            content = msg.get("content", [])
            if isinstance(content, list):
                texts = [
                    c.get("text", "")
                    for c in content
                    if isinstance(c, dict) and c.get("type") in ("output_text", "text")
                ]
                if texts:
                    uitars_messages.append(
                        {"role": "assistant", "content": "\n".join([t for t in texts if t])}
                    )
            elif isinstance(content, str) and content:
                uitars_messages.append({"role": "assistant", "content": content})
            continue

        # On outputs, flush pending assistant block and send outputs as user messages
        if mtype in ("function_call_output", "computer_call_output"):
            flush_seed_block(pending_think, pending_functions)
            pending_think, pending_functions = None, []
            output = msg.get("output")
            if isinstance(output, dict) and output.get("type") == "input_image":
                # Screenshot output -> user image message.
                img_url = output.get("image_url")
                if img_url:
                    uitars_messages.append(
                        {
                            "role": "user",
                            "content": [
                                {"type": "image_url", "image_url": {"url": img_url}},
                            ],
                        }
                    )
            elif isinstance(output, str):
                uitars_messages.append({"role": "user", "content": output})
            else:
                # Fallback stringify
                uitars_messages.append({"role": "user", "content": json.dumps(output)})
            continue

    # Flush any remaining pending seed block
    flush_seed_block(pending_think, pending_functions)

    return uitars_messages
643
+
644
+
645
def _to_response_items(
    actions: List[Dict[str, Any]],
    tool_names: Optional[set[str]] = None,
    width: Optional[int] = None,
    height: Optional[int] = None,
) -> List[Any]:
    """Map parsed actions into Responses items (computer actions + optional reasoning).

    Args:
        actions: {"function", "parameters"} dicts from _parse_seed_tool_calls;
            the first entry may carry a "reasoning" key.
        tool_names: Names of caller-provided function tools; matching unknown
            functions are emitted as function_call items.
        width: Screen width in pixels used to denormalize 0-1000 coordinates
            (defaults to 1024 when falsy).
        height: Screen height in pixels (defaults to 768 when falsy).

    Returns:
        A list of Responses items; iteration stops after a "finished" action.
    """
    items: List[Any] = []
    tool_names = tool_names or set()

    # Dimensions default
    w = int(width) if width else 1024
    h = int(height) if height else 768

    def parse_point(raw: str) -> Optional[Tuple[int, int]]:
        # Accepts "<point>x y</point>" or a plain "x y" pair; returns pixels,
        # or None when no coordinate pair is present.
        m = re.search(r"([\-\d\.]+)\s+([\-\d\.]+)", raw)
        if not m:
            return None
        return _denormalize_xy_from_uitars(float(m.group(1)), float(m.group(2)), w, h)

    # Optional top-level reasoning attached to first action
    if actions and actions[0].get("reasoning"):
        items.append(make_reasoning_item(actions[0]["reasoning"]))

    for a in actions:
        fn = a.get("function")
        params = a.get("parameters", {})
        if fn == "reasoning":
            items.append(make_reasoning_item(params.get("content", "")))
        elif fn in ("click", "left_double", "right_single"):
            pt = parse_point(params.get("point", "").strip())
            if pt is None:
                continue
            x, y = pt
            if fn == "left_double":
                items.append(make_double_click_item(x, y))
            elif fn == "right_single":
                items.append(make_click_item(x, y, "right"))
            else:
                items.append(make_click_item(x, y, "left"))
        elif fn == "move_to":
            pt = parse_point(params.get("point", "").strip())
            if pt is None:
                continue
            items.append(make_move_item(pt[0], pt[1]))
        elif fn == "drag":
            start = parse_point(params.get("start_point", "").strip())
            end = parse_point(params.get("end_point", "").strip())
            if start is None or end is None:
                continue
            items.append(
                make_drag_item([{"x": start[0], "y": start[1]}, {"x": end[0], "y": end[1]}])
            )
        elif fn == "hotkey":
            # Keys are space-separated per the tool schema.
            keys = params.get("key", "").split()
            if keys:
                items.append(make_keypress_item(keys))
        elif fn == "press":
            key = params.get("key", "")
            if key:
                items.append(make_keypress_item([key]))
        elif fn == "type":
            items.append(make_type_item(params.get("content", "")))
        elif fn == "scroll":
            # direction: up/down/left/right. Point optional; defaults to the
            # center of the virtual grid.
            direction = params.get("direction", "down").lower()
            pt = parse_point(params.get("point", ""))
            if pt is None:
                pt = _denormalize_xy_from_uitars(500.0, 500.0, w, h)
            x, y = pt
            # BUGFIX: previously dy was `5 if up else -5`, so horizontal
            # scrolls ("left"/"right") also emitted dy=-5 — an unintended
            # vertical scroll. dy must be 0 for horizontal directions.
            dy = 5 if direction == "up" else (-5 if direction == "down" else 0)
            dx = 5 if direction == "right" else (-5 if direction == "left" else 0)
            items.append(make_scroll_item(x, y, dx, dy))
        elif fn == "wait":
            items.append(make_wait_item())
        elif fn == "finished":
            content = params.get("content", "")
            items.append(make_output_text_item(content or "Task completed."))
            break
        elif fn in ("take_screenshot", "open_computer"):
            # open_computer is treated as "show me the screen".
            items.append(make_screenshot_item())
        else:
            # If this function name is present in provided tool schemas, emit function_call
            if fn in tool_names:
                # Parameters are strings; pass through as-is as arguments.
                items.append(make_function_call_item(fn, params))
            else:
                # Unknown function -> surface as assistant text
                items.append(make_output_text_item(f"Unknown action: {fn} {params}"))

    return items
751
+
752
+
753
+ @register_agent(models=r"(?i).*ui-?tars-?2.*")
754
+ class UITARS2Config:
755
+ async def predict_step(
756
+ self,
757
+ messages: List[Dict[str, Any]],
758
+ model: str,
759
+ tools: Optional[List[Dict[str, Any]]] = None,
760
+ max_retries: Optional[int] = None,
761
+ stream: bool = False,
762
+ computer_handler=None,
763
+ use_prompt_caching: Optional[bool] = False,
764
+ _on_api_start=None,
765
+ _on_api_end=None,
766
+ _on_usage=None,
767
+ _on_screenshot=None,
768
+ **kwargs,
769
+ ) -> Dict[str, Any]:
770
+ # Determine screen dimensions (prefer computer_handler, fallback to last screenshot)
771
+ width: Optional[int] = None
772
+ height: Optional[int] = None
773
+ if computer_handler is not None and hasattr(computer_handler, "get_dimensions"):
774
+ try:
775
+ dims = await computer_handler.get_dimensions() # type: ignore
776
+ if isinstance(dims, (list, tuple)) and len(dims) == 2:
777
+ width, height = int(dims[0]), int(dims[1])
778
+ except Exception:
779
+ pass
780
+
781
+ if width is None or height is None:
782
+ try:
783
+ last_out = get_last_computer_call_output(messages) # type: ignore
784
+ if last_out:
785
+ image_url = last_out.get("output", {}).get("image_url", "")
786
+ if image_url:
787
+ b64 = image_url.split(",")[-1]
788
+ img_bytes = base64.b64decode(b64)
789
+ if Image is not None:
790
+ img = Image.open(io.BytesIO(img_bytes))
791
+ width, height = img.size
792
+ except Exception:
793
+ pass
794
+
795
+ if width is None or height is None:
796
+ width, height = 1024, 768
797
+
798
+ # Convert Responses items to UI-TARS style messages with <seed:tool_call> history
799
+ completion_messages = _to_uitars_messages(messages, width, height)
800
+
801
+ # Build dynamic system prompt by concatenating built-in schemas and provided function tools
802
+ provided_fn_schemas = _extract_function_schemas_from_tools(tools)
803
+ combined_schemas = (
804
+ TOOL_SCHEMAS + provided_fn_schemas if provided_fn_schemas else TOOL_SCHEMAS
805
+ )
806
+ dynamic_system_prompt = (
807
+ _PROMPT_PREFIX + _format_tool_schemas_json_lines(combined_schemas) + _PROMPT_SUFFIX
808
+ )
809
+
810
+ # Prepend system prompt (based on training prompts + provided tools)
811
+ litellm_messages: List[Dict[str, Any]] = [
812
+ {"role": "system", "content": dynamic_system_prompt},
813
+ ]
814
+ litellm_messages.extend(completion_messages)
815
+
816
+ api_kwargs: Dict[str, Any] = {
817
+ "model": model,
818
+ "messages": litellm_messages,
819
+ "max_retries": max_retries,
820
+ "stream": stream,
821
+ **{k: v for k, v in kwargs.items()},
822
+ }
823
+ if use_prompt_caching:
824
+ api_kwargs["use_prompt_caching"] = use_prompt_caching
825
+
826
+ if _on_api_start:
827
+ await _on_api_start(api_kwargs)
828
+
829
+ response = await litellm.acompletion(**api_kwargs)
830
+
831
+ if _on_api_end:
832
+ await _on_api_end(api_kwargs, response)
833
+
834
+ usage = {
835
+ **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage( # type: ignore
836
+ response.usage
837
+ ).model_dump(),
838
+ "response_cost": response._hidden_params.get("response_cost", 0.0),
839
+ }
840
+ if _on_usage:
841
+ await _on_usage(usage)
842
+
843
+ # Extract text content (first choice)
844
+ response_dict = response.model_dump() # type: ignore
845
+ content_text = ""
846
+ choices = response_dict.get("choices", [])
847
+ if choices:
848
+ msg = choices[0].get("message", {})
849
+ # message.content may be string or array; gather text pieces
850
+ mc = msg.get("content")
851
+ if isinstance(mc, str):
852
+ content_text = mc
853
+ elif isinstance(mc, list):
854
+ parts = []
855
+ for part in mc:
856
+ if isinstance(part, dict) and part.get("type") == "text":
857
+ parts.append(part.get("text", ""))
858
+ content_text = "\n".join([p for p in parts if p])
859
+
860
+ # Parse the seed tool calls and map to response items
861
+ actions = _parse_seed_tool_calls(content_text)
862
+ # Build set of tool names from provided tools to emit function_call items
863
+ tool_names: set[str] = set()
864
+ for s in provided_fn_schemas:
865
+ name = s.get("name")
866
+ if isinstance(name, str):
867
+ tool_names.add(name)
868
+ output_items = _to_response_items(actions, tool_names, width, height)
869
+
870
+ return {"output": output_items, "usage": usage}
871
+
872
+ def get_capabilities(self) -> List[AgentCapability]:
873
+ return ["step"]
874
+
875
+ async def predict_click(
876
+ self, model: str, image_b64: str, instruction: str, **kwargs
877
+ ) -> Optional[Tuple[int, int]]:
878
+ """Predict a single click coordinate using a minimal prompt with a click tool.
879
+
880
+ This sends the current screenshot and instruction, asking the model to
881
+ output a click action in the form:
882
+ Action: click(point='(x,y)')
883
+ """
884
+ # Minimal grounding-style prompt
885
+ system_text = (
886
+ "You are a GUI agent. Given the instruction, return a single action on the current screen.\n\n"
887
+ "## Output Format\n\n"
888
+ "Action: click(point='(x,y)')\n\n"
889
+ "## User Instruction\n"
890
+ f"{instruction}"
891
+ )
892
+
893
+ # Build messages with image
894
+ litellm_messages: List[Dict[str, Any]] = [
895
+ {"role": "system", "content": system_text},
896
+ {
897
+ "role": "user",
898
+ "content": [
899
+ {"type": "text", "text": "Please return a single click action."},
900
+ {
901
+ "type": "image_url",
902
+ "image_url": {"url": f"data:image/png;base64,{image_b64}"},
903
+ },
904
+ ],
905
+ },
906
+ ]
907
+
908
+ api_kwargs: Dict[str, Any] = {
909
+ "model": model,
910
+ "messages": litellm_messages,
911
+ "max_tokens": kwargs.get("max_tokens", 512),
912
+ "temperature": kwargs.get("temperature", 0.0),
913
+ "do_sample": kwargs.get("temperature", 0.0) > 0.0,
914
+ }
915
+ api_kwargs.update(
916
+ {k: v for k, v in (kwargs or {}).items() if k not in ["max_tokens", "temperature"]}
917
+ )
918
+
919
+ response = await litellm.acompletion(**api_kwargs)
920
+ # Extract response content
921
+ response_dict = response.model_dump() # type: ignore
922
+ choices = response_dict.get("choices", [])
923
+ if not choices:
924
+ return None
925
+ msg = choices[0].get("message", {})
926
+ content_text = msg.get("content", "")
927
+ if isinstance(content_text, list):
928
+ text_parts = [
929
+ p.get("text", "")
930
+ for p in content_text
931
+ if isinstance(p, dict) and p.get("type") == "text"
932
+ ]
933
+ content_text = "\n".join([t for t in text_parts if t])
934
+ if not isinstance(content_text, str):
935
+ return None
936
+
937
+ # Parse coordinates
938
+ # Pattern for click(point='(x,y)') or click(start_box='(x,y)')
939
+ patterns = [
940
+ r"click\(point='\((\d+),(\d+)\)'\)",
941
+ r"click\((?:start_box|point)='\((\d+),(\d+)\)'\)",
942
+ ]
943
+ for pat in patterns:
944
+ m = re.search(pat, content_text)
945
+ if m:
946
+ try:
947
+ x, y = int(m.group(1)), int(m.group(2))
948
+ return (x, y)
949
+ except Exception:
950
+ pass
951
+ return None