cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/__init__.py +4 -0
- agent/adapters/azure_ml_adapter.py +283 -0
- agent/adapters/cua_adapter.py +161 -0
- agent/adapters/huggingfacelocal_adapter.py +67 -125
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +41 -0
- agent/adapters/models/generic.py +78 -0
- agent/adapters/models/internvl.py +290 -0
- agent/adapters/models/opencua.py +115 -0
- agent/adapters/models/qwen2_5_vl.py +78 -0
- agent/agent.py +337 -185
- agent/callbacks/__init__.py +9 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +54 -98
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +35 -33
- agent/callbacks/otel.py +291 -0
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/prompt_instructions.py +47 -0
- agent/callbacks/telemetry.py +99 -61
- agent/callbacks/trajectory_saver.py +95 -69
- agent/cli.py +269 -119
- agent/computers/__init__.py +14 -9
- agent/computers/base.py +32 -19
- agent/computers/cua.py +52 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +359 -235
- agent/integrations/hud/__init__.py +38 -99
- agent/integrations/hud/agent.py +369 -0
- agent/integrations/hud/proxy.py +166 -52
- agent/loops/__init__.py +44 -14
- agent/loops/anthropic.py +579 -492
- agent/loops/base.py +19 -15
- agent/loops/composed_grounded.py +136 -150
- agent/loops/fara/__init__.py +8 -0
- agent/loops/fara/config.py +506 -0
- agent/loops/fara/helpers.py +357 -0
- agent/loops/fara/schema.py +143 -0
- agent/loops/gelato.py +183 -0
- agent/loops/gemini.py +935 -0
- agent/loops/generic_vlm.py +601 -0
- agent/loops/glm45v.py +140 -135
- agent/loops/gta1.py +48 -51
- agent/loops/holo.py +218 -0
- agent/loops/internvl.py +180 -0
- agent/loops/moondream3.py +493 -0
- agent/loops/omniparser.py +326 -226
- agent/loops/openai.py +50 -51
- agent/loops/opencua.py +134 -0
- agent/loops/uiins.py +175 -0
- agent/loops/uitars.py +247 -206
- agent/loops/uitars2.py +951 -0
- agent/playground/__init__.py +5 -0
- agent/playground/server.py +301 -0
- agent/proxy/examples.py +61 -57
- agent/proxy/handlers.py +46 -39
- agent/responses.py +447 -347
- agent/tools/__init__.py +24 -0
- agent/tools/base.py +253 -0
- agent/tools/browser_tool.py +423 -0
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +25 -22
- agent/ui/gradio/ui_components.py +314 -167
- cua_agent-0.7.16.dist-info/METADATA +85 -0
- cua_agent-0.7.16.dist-info/RECORD +79 -0
- {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
- cua_agent-0.4.22.dist-info/METADATA +0 -436
- cua_agent-0.4.22.dist-info/RECORD +0 -51
- {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/loops/uitars2.py
ADDED
|
@@ -0,0 +1,951 @@
|
|
|
1
|
+
"""
|
|
2
|
+
UITARS-2 agent loop implementation using LiteLLM.
|
|
3
|
+
- Prepends a system prompt modeled after the training prompts in examples/seed_16_gui.ipynb
|
|
4
|
+
- Converts Responses items -> completion messages
|
|
5
|
+
- Calls litellm.acompletion
|
|
6
|
+
- Parses <seed:tool_call> ... </seed:tool_call> outputs back into Responses items (computer actions)
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import base64
|
|
12
|
+
import io
|
|
13
|
+
import json
|
|
14
|
+
import re
|
|
15
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
16
|
+
|
|
17
|
+
import litellm
|
|
18
|
+
from litellm.responses.litellm_completion_transformation.transformation import (
|
|
19
|
+
LiteLLMCompletionResponsesConfig,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
from ..decorators import register_agent
|
|
23
|
+
from .omniparser import get_last_computer_call_output # type: ignore
|
|
24
|
+
|
|
25
|
+
try:
|
|
26
|
+
from PIL import Image # type: ignore
|
|
27
|
+
except Exception: # pragma: no cover
|
|
28
|
+
Image = None # type: ignore
|
|
29
|
+
from ..responses import (
|
|
30
|
+
convert_responses_items_to_completion_messages,
|
|
31
|
+
make_click_item,
|
|
32
|
+
make_double_click_item,
|
|
33
|
+
make_drag_item,
|
|
34
|
+
make_function_call_item,
|
|
35
|
+
make_keypress_item,
|
|
36
|
+
make_move_item,
|
|
37
|
+
make_output_text_item,
|
|
38
|
+
make_reasoning_item,
|
|
39
|
+
make_screenshot_item,
|
|
40
|
+
make_scroll_item,
|
|
41
|
+
make_type_item,
|
|
42
|
+
make_wait_item,
|
|
43
|
+
)
|
|
44
|
+
from ..types import AgentCapability
|
|
45
|
+
|
|
46
|
+
# JSON schemas for the UI-TARS-2 function set. These are serialized into the
# system prompt via _format_tool_schemas_json_lines rather than passed as
# native tool definitions: the model emits calls back as <seed:tool_call> text.
# Coordinate-taking functions use a normalized 0-1000 "<point>x y</point>"
# string format (see _normalize_xy_to_uitars / _denormalize_xy_from_uitars).
TOOL_SCHEMAS: List[Dict[str, Any]] = [
    {
        "type": "function",
        "name": "open_computer",
        "parameters": {},
        "description": "Open computer.",
    },
    {
        "type": "function",
        "name": "click",
        "parameters": {
            "type": "object",
            "properties": {
                "point": {
                    "type": "string",
                    "description": "Click coordinates. The format is: <point>x y</point>",
                }
            },
            "required": ["point"],
        },
        "description": "Mouse left single click action.",
    },
    {
        "type": "function",
        "name": "left_double",
        "parameters": {
            "type": "object",
            "properties": {
                "point": {
                    "type": "string",
                    "description": "Click coordinates. The format is: <point>x y</point>",
                }
            },
            "required": ["point"],
        },
        "description": "Mouse left double click action.",
    },
    {
        "type": "function",
        "name": "right_single",
        "parameters": {
            "type": "object",
            "properties": {
                "point": {
                    "type": "string",
                    "description": "Click coordinates. The format is: <point>x y</point>",
                }
            },
            "required": ["point"],
        },
        "description": "Mouse right single click action.",
    },
    {
        "type": "function",
        "name": "scroll",
        "parameters": {
            "type": "object",
            "properties": {
                "point": {
                    "type": "string",
                    "description": "Scroll start position. If not specified, default to execute on the current mouse position. The format is: <point>x y</point>",
                },
                "direction": {
                    "type": "string",
                    "description": "Scroll direction.",
                    "enum": ["up", "down", "left", "right"],
                },
            },
            "required": ["direction"],
        },
        "description": "Scroll action.",
    },
    {
        "type": "function",
        "name": "move_to",
        "parameters": {
            "type": "object",
            "properties": {
                "point": {
                    "type": "string",
                    "description": "Target coordinates. The format is: <point>x y</point>",
                }
            },
            "required": ["point"],
        },
        "description": "Mouse move action.",
    },
    {
        "type": "function",
        "name": "hotkey",
        "parameters": {
            "type": "object",
            "properties": {
                "key": {
                    "type": "string",
                    "description": "Hotkeys you want to press. Split keys with a space and use lowercase.",
                }
            },
            "required": ["key"],
        },
        "description": "Press hotkey.",
    },
    {
        "type": "function",
        "name": "finished",
        "parameters": {
            "type": "object",
            "properties": {
                "content": {
                    "type": "string",
                    "description": "Provide the final answer or response to complete the task.",
                }
            },
            "required": [],
        },
        "description": "This function is used to indicate the completion of a task by providing the final answer or response.",
    },
    {
        "type": "function",
        "name": "press",
        "parameters": {
            "type": "object",
            "properties": {
                "key": {
                    "type": "string",
                    "description": "Key you want to press. Only one key can be pressed at one time.",
                }
            },
            "required": ["key"],
        },
        "description": "Press key.",
    },
    {
        "type": "function",
        "name": "release",
        "parameters": {
            "type": "object",
            "properties": {
                "key": {
                    "type": "string",
                    "description": "Key you want to release. Only one key can be released at one time.",
                }
            },
            "required": ["key"],
        },
        "description": "Release key.",
    },
    {
        "type": "function",
        "name": "mouse_down",
        "parameters": {
            "type": "object",
            "properties": {
                "point": {
                    "type": "string",
                    "description": "Mouse down position. If not specified, default to execute on the current mouse position. The format is: <point>x y</point>",
                },
                "button": {
                    "type": "string",
                    "description": "Down button. Default to left.",
                    "enum": ["left", "right"],
                },
            },
            "required": [],
        },
        "description": "Mouse down action.",
    },
    {
        "type": "function",
        "name": "mouse_up",
        "parameters": {
            "type": "object",
            "properties": {
                "point": {
                    "type": "string",
                    "description": "Mouse up position. If not specified, default to execute on the current mouse position. The format is: <point>x y</point>",
                },
                "button": {
                    "type": "string",
                    "description": "Up button. Default to left.",
                    "enum": ["left", "right"],
                },
            },
            "required": [],
        },
        "description": "Mouse up action.",
    },
    {
        "type": "function",
        "name": "call_user",
        "parameters": {
            "type": "object",
            "properties": {
                "content": {
                    "type": "string",
                    "description": "Message or information displayed to the user to request their input, feedback, or guidance.",
                }
            },
            "required": [],
        },
        "description": "This function is used to interact with the user by displaying a message and requesting their input, feedback, or guidance.",
    },
    {
        "type": "function",
        "name": "wait",
        "parameters": {
            "type": "object",
            "properties": {"time": {"type": "integer", "description": "Wait time in seconds."}},
            "required": [],
        },
        "description": "Wait for a while.",
    },
    {
        "type": "function",
        "name": "drag",
        "parameters": {
            "type": "object",
            "properties": {
                "start_point": {
                    "type": "string",
                    "description": "Drag start point. The format is: <point>x y</point>",
                },
                "end_point": {
                    "type": "string",
                    "description": "Drag end point. The format is: <point>x y</point>",
                },
            },
            "required": ["start_point", "end_point"],
        },
        "description": "Mouse left button drag action.",
    },
    {
        "type": "function",
        "name": "type",
        "parameters": {
            "type": "object",
            "properties": {
                "content": {
                    "type": "string",
                    "description": "Type content. If you want to submit your input, use \\n at the end of content.",
                }
            },
            "required": ["content"],
        },
        "description": "Type content.",
    },
    {
        "type": "function",
        "name": "take_screenshot",
        "parameters": {},
        "description": "Take screenshot.",
    },
]
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def _format_tool_schemas_json_lines(schemas: List[Dict[str, Any]]) -> str:
|
|
302
|
+
# Nicely formatted: pretty JSON with indentation, separated by blank lines
|
|
303
|
+
return "\n\n".join(json.dumps(s, ensure_ascii=False, indent=2) for s in schemas) + "\n\n"
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
# Preamble of the system prompt: thinking-mode instructions (modeled after the
# seed_16_gui training prompts) followed by the "## Function Definition" header.
# NOTE(review): the <think_never_used_...> sentinel tag text is part of the
# model's expected protocol — do not alter it.
_PROMPT_PREFIX = (
    "You should begin by detailing the internal reasoning process, and then present the answer to the user. "
    "The reasoning process should be enclosed within <think_never_used_51bce0c785ca2f68081bfa7d91973934> "
    "</think_never_used_51bce0c785ca2f68081bfa7d91973934> tags, as follows:\n"
    "<think_never_used_51bce0c785ca2f68081bfa7d91973934> reasoning process here "
    "</think_never_used_51bce0c785ca2f68081bfa7d91973934> answer here.\n\n"
    "You have different modes of thinking:\n"
    "Unrestricted think mode: Engage in an internal thinking process with thorough reasoning and reflections. "
    "You have an unlimited budget for thinking tokens and can continue thinking until you fully solve the problem.\n"
    "Efficient think mode: Provide a concise internal thinking process with efficient reasoning and reflections. "
    "You don't have a strict token budget but be less verbose and more direct in your thinking.\n"
    "No think mode: Respond directly to the question without any internal reasoning process or extra thinking tokens. "
    "Still follow the template with the minimum required thinking tokens to justify the answer.\n"
    "Budgeted think mode: Limit your internal reasoning and reflections to stay within the specified token budget\n\n"
    "Based on the complexity of the problem, select the appropriate mode for reasoning among the provided options listed below.\n\n"
    "Provided Mode(s):\nEfficient think.\n\n"
    "You are provided with a task description, a history of previous actions, and corresponding screenshots. "
    "Your goal is to perform the next action to complete the task. "
    "If performing the same action multiple times results in a static screen with no changes, attempt a modified or alternative action.\n\n"
    "## Function Definition\n\n"
    "- You have access to the following functions:\n\n"
)

# Tail of the system prompt: the <seed:tool_call> call syntax the model must
# emit, which _parse_seed_tool_calls parses back into actions.
_PROMPT_SUFFIX = (
    "- To call a function, use the following structure without any suffix:\n\n"
    "<gui_think> reasoning process </gui_think>\n"
    "<seed:tool_call><function=example_function_name><parameter=example_parameter_1>value_1</parameter>"
    "<parameter=example_parameter_2>multiline...\n</parameter></function></seed:tool_call>\n\n"
    "## Important Notes\n"
    "- Function calls must begin with <function= and end with </function>.\n"
    "- All required parameters must be explicitly provided.\n"
    "\n## Additional Notes\n"
    "- You can execute multiple actions within a single tool call. For example:\n"
    "<seed:tool_call><function=example_function_1><parameter=example_parameter_1>value_1</parameter><parameter=example_parameter_2>\n"
    "This is the value for the second parameter\nthat can span\nmultiple lines\n"
    "</parameter></function><function=example_function_2><parameter=example_parameter_3>value_4</parameter></function></seed:tool_call>"
)

# Full default system prompt: preamble + pretty-printed TOOL_SCHEMAS + call syntax.
SYSTEM_PROMPT = _PROMPT_PREFIX + _format_tool_schemas_json_lines(TOOL_SCHEMAS) + _PROMPT_SUFFIX
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
def _extract_function_schemas_from_tools(
|
|
349
|
+
tools: Optional[List[Dict[str, Any]]],
|
|
350
|
+
) -> List[Dict[str, Any]]:
|
|
351
|
+
schemas: List[Dict[str, Any]] = []
|
|
352
|
+
if not tools:
|
|
353
|
+
return schemas
|
|
354
|
+
for t in tools:
|
|
355
|
+
if t.get("type") == "function":
|
|
356
|
+
fn = t.get("function", {})
|
|
357
|
+
name = fn.get("name")
|
|
358
|
+
params = fn.get("parameters", {})
|
|
359
|
+
desc = fn.get("description", "")
|
|
360
|
+
if name:
|
|
361
|
+
schemas.append(
|
|
362
|
+
{
|
|
363
|
+
"type": "function",
|
|
364
|
+
"name": name,
|
|
365
|
+
"parameters": params if isinstance(params, dict) else {},
|
|
366
|
+
"description": desc,
|
|
367
|
+
}
|
|
368
|
+
)
|
|
369
|
+
return schemas
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
def _parse_seed_tool_calls(text: str) -> List[Dict[str, Any]]:
|
|
373
|
+
"""Parse <seed:tool_call> blocks into a list of {function, parameters} dicts.
|
|
374
|
+
Also captures optional <gui_think>...</gui_think> as reasoning.
|
|
375
|
+
"""
|
|
376
|
+
actions: List[Dict[str, Any]] = []
|
|
377
|
+
if not text:
|
|
378
|
+
return actions
|
|
379
|
+
|
|
380
|
+
# Extract reasoning if present
|
|
381
|
+
reasoning_text = None
|
|
382
|
+
think_match = re.search(r"<gui_think>([\s\S]*?)</gui_think>", text)
|
|
383
|
+
if think_match:
|
|
384
|
+
reasoning_text = think_match.group(1).strip()
|
|
385
|
+
|
|
386
|
+
# Iterate each seed tool_call block
|
|
387
|
+
for block in re.finditer(r"<seed:tool_call>([\s\S]*?)</seed:tool_call>", text):
|
|
388
|
+
content = block.group(1)
|
|
389
|
+
# One or multiple <function=...>...</function> inside
|
|
390
|
+
for fmatch in re.finditer(r"<function=([\w_]+)>([\s\S]*?)</function>", content):
|
|
391
|
+
fname = fmatch.group(1)
|
|
392
|
+
inner = fmatch.group(2)
|
|
393
|
+
params: Dict[str, str] = {}
|
|
394
|
+
for pmatch in re.finditer(r"<parameter=([\w_]+)>([\s\S]*?)</parameter>", inner):
|
|
395
|
+
pname = pmatch.group(1)
|
|
396
|
+
pval = pmatch.group(2).strip()
|
|
397
|
+
params[pname] = pval
|
|
398
|
+
actions.append({"function": fname, "parameters": params})
|
|
399
|
+
|
|
400
|
+
# If we have a global reasoning and at least one action, attach it to first
|
|
401
|
+
if reasoning_text and actions:
|
|
402
|
+
actions[0]["reasoning"] = reasoning_text
|
|
403
|
+
elif reasoning_text:
|
|
404
|
+
actions.append({"function": "reasoning", "parameters": {"content": reasoning_text}})
|
|
405
|
+
|
|
406
|
+
return actions
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
def _normalize_xy_to_uitars(x: int, y: int, width: int, height: int) -> Tuple[int, int]:
|
|
410
|
+
width = max(1, int(width))
|
|
411
|
+
height = max(1, int(height))
|
|
412
|
+
nx = max(0, min(1000, int(round((x / width) * 1000))))
|
|
413
|
+
ny = max(0, min(1000, int(round((y / height) * 1000))))
|
|
414
|
+
return nx, ny
|
|
415
|
+
|
|
416
|
+
|
|
417
|
+
def _denormalize_xy_from_uitars(nx: float, ny: float, width: int, height: int) -> Tuple[int, int]:
|
|
418
|
+
width = max(1, int(width))
|
|
419
|
+
height = max(1, int(height))
|
|
420
|
+
x = int(round((nx / 1000.0) * width))
|
|
421
|
+
y = int(round((ny / 1000.0) * height))
|
|
422
|
+
return x, y
|
|
423
|
+
|
|
424
|
+
|
|
425
|
+
def _map_computer_action_to_function(
    action: Dict[str, Any], width: int, height: int
) -> Optional[Dict[str, Any]]:
    """Map a computer action item to a UITARS function + parameters dict of strings.

    Pixel coordinates in the action are converted to the 0-1000 UI-TARS grid
    via _normalize_xy_to_uitars and rendered as "<point>x y</point>" strings.
    Returns dict like {"function": name, "parameters": {..}} or None if unknown.
    """
    # Accept either the responses-style "type" key or a legacy "action" key.
    atype = action.get("type") or action.get("action")
    if atype == "click":
        x, y = action.get("x"), action.get("y")
        btn = action.get("button", "left")
        if x is None or y is None:
            return None
        nx, ny = _normalize_xy_to_uitars(int(x), int(y), width, height)
        # Right clicks get their own function name; any non-right button
        # (including middle) falls through to a plain left "click".
        if btn == "right":
            return {
                "function": "right_single",
                "parameters": {"point": f"<point>{nx} {ny}</point>"},
            }
        return {"function": "click", "parameters": {"point": f"<point>{nx} {ny}</point>"}}
    if atype == "double_click":
        x, y = action.get("x"), action.get("y")
        if x is None or y is None:
            return None
        nx, ny = _normalize_xy_to_uitars(int(x), int(y), width, height)
        return {"function": "left_double", "parameters": {"point": f"<point>{nx} {ny}</point>"}}
    if atype == "move":
        x, y = action.get("x"), action.get("y")
        if x is None or y is None:
            return None
        nx, ny = _normalize_xy_to_uitars(int(x), int(y), width, height)
        return {"function": "move_to", "parameters": {"point": f"<point>{nx} {ny}</point>"}}
    if atype == "keypress":
        keys = action.get("keys", [])
        if isinstance(keys, list) and keys:
            # Single key -> "press"; a chord -> space-joined "hotkey".
            if len(keys) == 1:
                return {"function": "press", "parameters": {"key": keys[0]}}
            else:
                return {"function": "hotkey", "parameters": {"key": " ".join(keys)}}
        return None
    if atype == "type":
        text = action.get("text", "")
        return {"function": "type", "parameters": {"content": text}}
    if atype == "scroll":
        # Default the anchor to screen center (512 in pixel terms here —
        # NOTE(review): this default is in pixels, then normalized; confirm
        # callers always supply x/y for non-1024-wide screens.
        x, y = action.get("x", 512), action.get("y", 512)
        nx, ny = _normalize_xy_to_uitars(int(x), int(y), width, height)
        sx, sy = action.get("scroll_x", 0), action.get("scroll_y", 0)
        # Our parser used positive sy for up
        direction = (
            "up"
            if sy and sy > 0
            else (
                "down"
                if sy and sy < 0
                else ("right" if sx and sx > 0 else ("left" if sx and sx < 0 else "down"))
            )
        )
        return {
            "function": "scroll",
            "parameters": {"direction": direction, "point": f"<point>{nx} {ny}</point>"},
        }
    if atype == "drag":
        # Only the endpoints of the drag path are preserved; intermediate
        # waypoints are dropped.
        path = action.get("path", [])
        if isinstance(path, list) and len(path) >= 2:
            sx, sy = path[0].get("x"), path[0].get("y")
            ex, ey = path[-1].get("x"), path[-1].get("y")
            if sx is None or sy is None or ex is None or ey is None:
                return None
            nsx, nsy = _normalize_xy_to_uitars(int(sx), int(sy), width, height)
            nex, ney = _normalize_xy_to_uitars(int(ex), int(ey), width, height)
            return {
                "function": "drag",
                "parameters": {
                    "start_point": f"<point>{nsx} {nsy}</point>",
                    "end_point": f"<point>{nex} {ney}</point>",
                },
            }
        return None
    if atype == "wait":
        return {"function": "wait", "parameters": {}}
    if atype == "screenshot":
        return {"function": "take_screenshot", "parameters": {}}
    # Fallback unknown
    return None
|
|
508
|
+
|
|
509
|
+
|
|
510
|
+
def _to_uitars_messages(
    messages: List[Dict[str, Any]], width: int, height: int
) -> List[Dict[str, Any]]:
    """Convert responses items into completion messages tailored for UI-TARS.

    - User content is passed through similar to convert_responses_items_to_completion_messages
    - Assistant/tool history is rendered as text with <gui_think> and <seed:tool_call> blocks

    Consecutive reasoning + call items are accumulated into one assistant
    "seed block", which is flushed whenever a user message, assistant text
    message, or tool output is encountered.
    """
    uitars_messages: List[Dict[str, Any]] = []

    # Emit the accumulated reasoning + function calls as a single assistant
    # message in the <gui_think>/<seed:tool_call> text format; no-op if empty.
    def flush_seed_block(pending_think: Optional[str], pending_functions: List[Dict[str, Any]]):
        if not pending_think and not pending_functions:
            return
        parts: List[str] = []
        if pending_think:
            parts.append(f"<gui_think> {pending_think} </gui_think>")
        if pending_functions:
            inner = []
            for f in pending_functions:
                fname = f["function"]
                params = f.get("parameters", {})
                param_blocks = []
                for k, v in params.items():
                    param_blocks.append(f"<parameter={k}>{v}</parameter>")
                inner.append(f"<function={fname}>{''.join(param_blocks)}</function>")
            parts.append(f"<seed:tool_call>{''.join(inner)}</seed:tool_call>")
        uitars_messages.append({"role": "assistant", "content": "".join(parts)})

    # Accumulators for a single assistant seed block
    pending_think: Optional[str] = None
    pending_functions: List[Dict[str, Any]] = []

    for msg in messages:
        mtype = msg.get("type")
        role = msg.get("role")

        # On any user message, flush current assistant block
        if role == "user" or mtype == "user":
            flush_seed_block(pending_think, pending_functions)
            pending_think, pending_functions = None, []

            content = msg.get("content", "")
            if isinstance(content, list):
                # Translate responses-style content parts to completion-style.
                completion_content = []
                for item in content:
                    if item.get("type") == "input_image":
                        completion_content.append(
                            {"type": "image_url", "image_url": {"url": item.get("image_url")}}
                        )
                    elif item.get("type") in ("input_text", "text"):
                        completion_content.append({"type": "text", "text": item.get("text")})
                uitars_messages.append({"role": "user", "content": completion_content})
            elif isinstance(content, str):
                uitars_messages.append({"role": "user", "content": content})
            continue

        # Reasoning item
        if mtype == "reasoning":
            # Responses reasoning stores summary list
            summary = msg.get("summary", [])
            texts = [
                s.get("text", "")
                for s in summary
                if isinstance(s, dict) and s.get("type") == "summary_text"
            ]
            if texts:
                # NOTE(review): a later reasoning item overwrites (not appends
                # to) pending_think before the block is flushed.
                pending_think = "\n".join([t for t in texts if t])
            continue

        # Computer/tool calls -> map to functions
        if mtype == "computer_call":
            f = _map_computer_action_to_function(msg.get("action", {}), width, height)
            if f:
                pending_functions.append(f)
            continue
        if mtype == "function_call":
            # Include custom tools as-is
            name = msg.get("name")
            try:
                args_obj = json.loads(msg.get("arguments", "{}"))
            except json.JSONDecodeError:
                args_obj = {}
            # Ensure string values
            params = {k: (str(v) if not isinstance(v, str) else v) for k, v in args_obj.items()}
            pending_functions.append({"function": name, "parameters": params})
            continue

        # If assistant message text is given, flush current block and add as plain assistant text
        if role == "assistant" or mtype == "message":
            flush_seed_block(pending_think, pending_functions)
            pending_think, pending_functions = None, []
            content = msg.get("content", [])
            if isinstance(content, list):
                texts = [
                    c.get("text", "")
                    for c in content
                    if isinstance(c, dict) and c.get("type") in ("output_text", "text")
                ]
                if texts:
                    uitars_messages.append(
                        {"role": "assistant", "content": "\n".join([t for t in texts if t])}
                    )
            elif isinstance(content, str) and content:
                uitars_messages.append({"role": "assistant", "content": content})
            continue

        # On outputs, flush pending assistant block and send outputs as user messages
        if mtype in ("function_call_output", "computer_call_output"):
            flush_seed_block(pending_think, pending_functions)
            pending_think, pending_functions = None, []
            output = msg.get("output")
            if isinstance(output, dict) and output.get("type") == "input_image":
                # Screenshot outputs become user image messages.
                img_url = output.get("image_url")
                if img_url:
                    uitars_messages.append(
                        {
                            "role": "user",
                            "content": [
                                {"type": "image_url", "image_url": {"url": img_url}},
                            ],
                        }
                    )
            elif isinstance(output, str):
                uitars_messages.append({"role": "user", "content": output})
            else:
                # Fallback stringify
                uitars_messages.append({"role": "user", "content": json.dumps(output)})
            continue

    # Flush any remaining pending seed block
    flush_seed_block(pending_think, pending_functions)

    return uitars_messages
|
|
643
|
+
|
|
644
|
+
|
|
645
|
+
def _to_response_items(
    actions: List[Dict[str, Any]],
    tool_names: Optional[set[str]] = None,
    width: Optional[int] = None,
    height: Optional[int] = None,
) -> List[Any]:
    """Map parsed UI-TARS actions into Responses items (computer actions + optional reasoning).

    Args:
        actions: Parsed action dicts with ``function``, ``parameters`` and an
            optional ``reasoning`` entry on the first action.
        tool_names: Names of caller-provided function tools; actions whose
            function name matches are emitted as ``function_call`` items
            instead of computer actions.
        width: Screen width in pixels used to denormalize model coordinates
            (defaults to 1024 when falsy).
        height: Screen height in pixels used to denormalize model coordinates
            (defaults to 768 when falsy).

    Returns:
        List of Responses API items built via the ``make_*_item`` helpers.
    """
    items: List[Any] = []
    tool_names = tool_names or set()

    # Optional top-level reasoning attached to first action
    if actions and actions[0].get("reasoning"):
        items.append(make_reasoning_item(actions[0]["reasoning"]))

    # Dimensions default
    w = int(width) if width else 1024
    h = int(height) if height else 768

    # Coordinate pattern shared by every point-bearing action; compiled once
    # instead of being re-scanned inline in each branch.
    point_re = re.compile(r"([\-\d\.]+)\s+([\-\d\.]+)")

    def _xy(text: str):
        """Parse '<point>x y</point>' or plain 'x y' and denormalize to pixels.

        Returns an (x, y) tuple, or None when no coordinate pair is found.
        """
        m = point_re.search(text or "")
        if not m:
            return None
        return _denormalize_xy_from_uitars(float(m.group(1)), float(m.group(2)), w, h)

    for a in actions:
        fn = a.get("function")
        params = a.get("parameters", {})
        if fn == "reasoning":
            items.append(make_reasoning_item(params.get("content", "")))
        elif fn in ("click", "left_double", "right_single"):
            # params.point is like: <point>x y</point> or plain "x y"
            pt = _xy(params.get("point", "").strip())
            if pt is None:
                continue
            x, y = pt
            if fn == "left_double":
                items.append(make_double_click_item(x, y))
            elif fn == "right_single":
                items.append(make_click_item(x, y, "right"))
            else:
                items.append(make_click_item(x, y, "left"))
        elif fn == "move_to":
            pt = _xy(params.get("point", "").strip())
            if pt is None:
                continue
            items.append(make_move_item(pt[0], pt[1]))
        elif fn == "drag":
            start = _xy(params.get("start_point", "").strip())
            end = _xy(params.get("end_point", "").strip())
            # Require both endpoints; a drag with a missing end is meaningless.
            if start is None or end is None:
                continue
            items.append(
                make_drag_item([{"x": start[0], "y": start[1]}, {"x": end[0], "y": end[1]}])
            )
        elif fn == "hotkey":
            # Space-separated key combo, e.g. "ctrl c".
            keys = params.get("key", "").split()
            if keys:
                items.append(make_keypress_item(keys))
        elif fn == "press":
            key = params.get("key", "")
            if key:
                items.append(make_keypress_item([key]))
        elif fn == "type":
            items.append(make_type_item(params.get("content", "")))
        elif fn == "scroll":
            # direction: up/down/left/right. Point optional (falls back to the
            # normalized screen center, 500/500 in model space).
            direction = params.get("direction", "down").lower()
            pt = _xy(params.get("point", ""))
            if pt is not None:
                x, y = pt
            else:
                x, y = _denormalize_xy_from_uitars(500.0, 500.0, w, h)
            # BUGFIX: previously dy was -5 for every non-"up" direction, so
            # "left"/"right" scrolls also scrolled down (diagonal scroll).
            # Vertical delta is now zero for horizontal directions, mirroring
            # how dx already handled the vertical ones.
            dy = 5 if direction == "up" else (-5 if direction == "down" else 0)
            dx = 5 if direction == "right" else (-5 if direction == "left" else 0)
            items.append(make_scroll_item(x, y, dx, dy))
        elif fn == "wait":
            items.append(make_wait_item())
        elif fn == "finished":
            content = params.get("content", "")
            items.append(make_output_text_item(content or "Task completed."))
            # Terminal action: ignore anything the model emitted after it.
            break
        elif fn in ("take_screenshot", "open_computer"):
            # Both actions surface as a screenshot request.
            items.append(make_screenshot_item())
        else:
            # If this function name is present in provided tool schemas, emit function_call
            if fn in tool_names:
                # Parameters are strings; pass through as-is
                items.append(make_function_call_item(fn, params))
            else:
                # Unknown function -> surface as assistant text
                items.append(make_output_text_item(f"Unknown action: {fn} {params}"))

    return items
|
|
751
|
+
|
|
752
|
+
|
|
753
|
+
@register_agent(models=r"(?i).*ui-?tars-?2.*")
class UITARS2Config:
    """Agent config for UI-TARS-2 style models.

    Converts Responses-style message history into UI-TARS chat messages with
    <seed:tool_call> blocks, calls the model through litellm, and parses the
    seed tool calls in the reply back into Responses output items.
    """

    async def predict_step(
        self,
        messages: List[Dict[str, Any]],
        model: str,
        tools: Optional[List[Dict[str, Any]]] = None,
        max_retries: Optional[int] = None,
        stream: bool = False,
        computer_handler=None,
        use_prompt_caching: Optional[bool] = False,
        _on_api_start=None,
        _on_api_end=None,
        _on_usage=None,
        _on_screenshot=None,
        **kwargs,
    ) -> Dict[str, Any]:
        """Run one agent step: build prompt, call the model, parse actions.

        Args:
            messages: Responses-style conversation items.
            model: litellm model identifier.
            tools: Optional tool schemas; function tools are appended to the
                system prompt and their names emitted as function_call items.
            max_retries: Passed through to ``litellm.acompletion``.
            stream: Passed through to ``litellm.acompletion``.
            computer_handler: Optional computer interface; used only to query
                screen dimensions here.
            use_prompt_caching: Forwarded to litellm when truthy.
            _on_api_start / _on_api_end / _on_usage / _on_screenshot:
                Optional async lifecycle callbacks (``_on_screenshot`` is
                accepted but unused in this method).
            **kwargs: Extra keyword args merged into the litellm call.

        Returns:
            Dict with ``output`` (list of Responses items) and ``usage``.
        """
        # Determine screen dimensions (prefer computer_handler, fallback to last screenshot)
        width: Optional[int] = None
        height: Optional[int] = None
        if computer_handler is not None and hasattr(computer_handler, "get_dimensions"):
            try:
                dims = await computer_handler.get_dimensions()  # type: ignore
                if isinstance(dims, (list, tuple)) and len(dims) == 2:
                    width, height = int(dims[0]), int(dims[1])
            except Exception:
                # Best-effort: fall through to the screenshot-based probe.
                pass

        if width is None or height is None:
            try:
                last_out = get_last_computer_call_output(messages)  # type: ignore
                if last_out:
                    # Screenshot outputs carry a data-URL; decode to read the size.
                    image_url = last_out.get("output", {}).get("image_url", "")
                    if image_url:
                        b64 = image_url.split(",")[-1]
                        img_bytes = base64.b64decode(b64)
                        if Image is not None:
                            img = Image.open(io.BytesIO(img_bytes))
                            width, height = img.size
            except Exception:
                # Best-effort: fall through to the hard-coded default below.
                pass

        if width is None or height is None:
            width, height = 1024, 768

        # Convert Responses items to UI-TARS style messages with <seed:tool_call> history
        completion_messages = _to_uitars_messages(messages, width, height)

        # Build dynamic system prompt by concatenating built-in schemas and provided function tools
        provided_fn_schemas = _extract_function_schemas_from_tools(tools)
        combined_schemas = (
            TOOL_SCHEMAS + provided_fn_schemas if provided_fn_schemas else TOOL_SCHEMAS
        )
        dynamic_system_prompt = (
            _PROMPT_PREFIX + _format_tool_schemas_json_lines(combined_schemas) + _PROMPT_SUFFIX
        )

        # Prepend system prompt (based on training prompts + provided tools)
        litellm_messages: List[Dict[str, Any]] = [
            {"role": "system", "content": dynamic_system_prompt},
        ]
        litellm_messages.extend(completion_messages)

        api_kwargs: Dict[str, Any] = {
            "model": model,
            "messages": litellm_messages,
            "max_retries": max_retries,
            "stream": stream,
            **{k: v for k, v in kwargs.items()},
        }
        if use_prompt_caching:
            api_kwargs["use_prompt_caching"] = use_prompt_caching

        if _on_api_start:
            await _on_api_start(api_kwargs)

        response = await litellm.acompletion(**api_kwargs)

        if _on_api_end:
            await _on_api_end(api_kwargs, response)

        # Translate chat-completion usage into Responses-style usage and
        # attach litellm's computed cost.
        usage = {
            **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(  # type: ignore
                response.usage
            ).model_dump(),
            "response_cost": response._hidden_params.get("response_cost", 0.0),
        }
        if _on_usage:
            await _on_usage(usage)

        # Extract text content (first choice)
        response_dict = response.model_dump()  # type: ignore
        content_text = ""
        choices = response_dict.get("choices", [])
        if choices:
            msg = choices[0].get("message", {})
            # message.content may be string or array; gather text pieces
            mc = msg.get("content")
            if isinstance(mc, str):
                content_text = mc
            elif isinstance(mc, list):
                parts = []
                for part in mc:
                    if isinstance(part, dict) and part.get("type") == "text":
                        parts.append(part.get("text", ""))
                content_text = "\n".join([p for p in parts if p])

        # Parse the seed tool calls and map to response items
        actions = _parse_seed_tool_calls(content_text)
        # Build set of tool names from provided tools to emit function_call items
        tool_names: set[str] = set()
        for s in provided_fn_schemas:
            name = s.get("name")
            if isinstance(name, str):
                tool_names.add(name)
        output_items = _to_response_items(actions, tool_names, width, height)

        return {"output": output_items, "usage": usage}

    def get_capabilities(self) -> List[AgentCapability]:
        """Return the agent capabilities supported by this config."""
        return ["step"]

    async def predict_click(
        self, model: str, image_b64: str, instruction: str, **kwargs
    ) -> Optional[Tuple[int, int]]:
        """Predict a single click coordinate using a minimal prompt with a click tool.

        This sends the current screenshot and instruction, asking the model to
        output a click action in the form:
            Action: click(point='(x,y)')

        Args:
            model: litellm model identifier.
            image_b64: Base64-encoded PNG screenshot (no data-URL prefix).
            instruction: Natural-language description of what to click.
            **kwargs: Extra keyword args merged into the litellm call
                (``max_tokens`` and ``temperature`` are consumed here).

        Returns:
            (x, y) pixel coordinates, or None if the reply cannot be parsed.
        """
        # Minimal grounding-style prompt
        system_text = (
            "You are a GUI agent. Given the instruction, return a single action on the current screen.\n\n"
            "## Output Format\n\n"
            "Action: click(point='(x,y)')\n\n"
            "## User Instruction\n"
            f"{instruction}"
        )

        # Build messages with image
        litellm_messages: List[Dict[str, Any]] = [
            {"role": "system", "content": system_text},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Please return a single click action."},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{image_b64}"},
                    },
                ],
            },
        ]

        api_kwargs: Dict[str, Any] = {
            "model": model,
            "messages": litellm_messages,
            "max_tokens": kwargs.get("max_tokens", 512),
            "temperature": kwargs.get("temperature", 0.0),
            # do_sample mirrors temperature: greedy decode when temperature is 0.
            "do_sample": kwargs.get("temperature", 0.0) > 0.0,
        }
        # Forward remaining caller kwargs (already-consumed keys excluded).
        api_kwargs.update(
            {k: v for k, v in (kwargs or {}).items() if k not in ["max_tokens", "temperature"]}
        )

        response = await litellm.acompletion(**api_kwargs)
        # Extract response content
        response_dict = response.model_dump()  # type: ignore
        choices = response_dict.get("choices", [])
        if not choices:
            return None
        msg = choices[0].get("message", {})
        content_text = msg.get("content", "")
        if isinstance(content_text, list):
            # Content may be a list of parts; keep only the text pieces.
            text_parts = [
                p.get("text", "")
                for p in content_text
                if isinstance(p, dict) and p.get("type") == "text"
            ]
            content_text = "\n".join([t for t in text_parts if t])
        if not isinstance(content_text, str):
            return None

        # Parse coordinates
        # Pattern for click(point='(x,y)') or click(start_box='(x,y)')
        patterns = [
            r"click\(point='\((\d+),(\d+)\)'\)",
            r"click\((?:start_box|point)='\((\d+),(\d+)\)'\)",
        ]
        for pat in patterns:
            m = re.search(pat, content_text)
            if m:
                try:
                    x, y = int(m.group(1)), int(m.group(2))
                    return (x, y)
                except Exception:
                    pass
        return None
|