cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of cua-agent might be problematic.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/__init__.py +4 -0
- agent/adapters/azure_ml_adapter.py +283 -0
- agent/adapters/cua_adapter.py +161 -0
- agent/adapters/huggingfacelocal_adapter.py +67 -125
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +41 -0
- agent/adapters/models/generic.py +78 -0
- agent/adapters/models/internvl.py +290 -0
- agent/adapters/models/opencua.py +115 -0
- agent/adapters/models/qwen2_5_vl.py +78 -0
- agent/agent.py +337 -185
- agent/callbacks/__init__.py +9 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +54 -98
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +35 -33
- agent/callbacks/otel.py +291 -0
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/prompt_instructions.py +47 -0
- agent/callbacks/telemetry.py +99 -61
- agent/callbacks/trajectory_saver.py +95 -69
- agent/cli.py +269 -119
- agent/computers/__init__.py +14 -9
- agent/computers/base.py +32 -19
- agent/computers/cua.py +52 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +359 -235
- agent/integrations/hud/__init__.py +38 -99
- agent/integrations/hud/agent.py +369 -0
- agent/integrations/hud/proxy.py +166 -52
- agent/loops/__init__.py +44 -14
- agent/loops/anthropic.py +579 -492
- agent/loops/base.py +19 -15
- agent/loops/composed_grounded.py +136 -150
- agent/loops/fara/__init__.py +8 -0
- agent/loops/fara/config.py +506 -0
- agent/loops/fara/helpers.py +357 -0
- agent/loops/fara/schema.py +143 -0
- agent/loops/gelato.py +183 -0
- agent/loops/gemini.py +935 -0
- agent/loops/generic_vlm.py +601 -0
- agent/loops/glm45v.py +140 -135
- agent/loops/gta1.py +48 -51
- agent/loops/holo.py +218 -0
- agent/loops/internvl.py +180 -0
- agent/loops/moondream3.py +493 -0
- agent/loops/omniparser.py +326 -226
- agent/loops/openai.py +50 -51
- agent/loops/opencua.py +134 -0
- agent/loops/uiins.py +175 -0
- agent/loops/uitars.py +247 -206
- agent/loops/uitars2.py +951 -0
- agent/playground/__init__.py +5 -0
- agent/playground/server.py +301 -0
- agent/proxy/examples.py +61 -57
- agent/proxy/handlers.py +46 -39
- agent/responses.py +447 -347
- agent/tools/__init__.py +24 -0
- agent/tools/base.py +253 -0
- agent/tools/browser_tool.py +423 -0
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +25 -22
- agent/ui/gradio/ui_components.py +314 -167
- cua_agent-0.7.16.dist-info/METADATA +85 -0
- cua_agent-0.7.16.dist-info/RECORD +79 -0
- {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
- cua_agent-0.4.22.dist-info/METADATA +0 -436
- cua_agent-0.4.22.dist-info/RECORD +0 -51
- {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/loops/fara/config.py
@@ -0,0 +1,506 @@
"""FARA VLM agent configuration."""

from __future__ import annotations

import ast
import json
from typing import Any, Dict, List, Optional, Tuple

import litellm
from litellm.responses.litellm_completion_transformation.transformation import (
    LiteLLMCompletionResponsesConfig,
)

from ...decorators import register_agent
from ...loops.base import AsyncAgentConfig
from ...responses import (
    convert_completion_messages_to_responses_items,
    convert_responses_items_to_completion_messages,
    make_reasoning_item,
)
from ...types import AgentCapability
from .helpers import (
    build_nous_system,
    convert_qwen_tool_args_to_computer_action,
    parse_tool_call_from_text,
    unnormalize_coordinate,
)


@register_agent(models=r"(?i).*fara-7b.*")
class FaraVlmConfig(AsyncAgentConfig):
    async def predict_step(
        self,
        messages: List[Dict[str, Any]],
        model: str,
        tools: Optional[List[Dict[str, Any]]] = None,
        max_retries: Optional[int] = None,
        stream: bool = False,
        computer_handler=None,
        use_prompt_caching: Optional[bool] = False,
        _on_api_start=None,
        _on_api_end=None,
        _on_usage=None,
        _on_screenshot=None,
        **kwargs,
    ) -> Dict[str, Any]:
        # Check whether the last message is a terminate function_call_output.
        # If so, return a final assistant message to stop the loop.
        if messages:
            last_msg = messages[-1]
            if last_msg.get("type") in ("function_call_output", "computer_call_output"):
                output_data = last_msg.get("output")

                # Parse string if needed (could be JSON or a Python dict literal)
                if isinstance(output_data, str):
                    try:
                        output_data = json.loads(output_data)
                    except Exception:
                        try:
                            output_data = ast.literal_eval(output_data)
                        except Exception:
                            pass

                # Check if it's a terminate action output (contains "terminated": True)
                if isinstance(output_data, dict) and output_data.get("terminated") is True:
                    return {
                        "output": [
                            {
                                "type": "message",
                                "role": "assistant",
                                "content": [{"type": "output_text", "text": "Task completed."}],
                            }
                        ],
                        "usage": {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0},
                    }
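                # Illustrative shape (not part of the released file) of a terminate
                # output item that triggers the early return above; "output" may be
                # a dict, a JSON string, or a Python-literal string:
                #   {"type": "function_call_output", "call_id": "call_0",
                #    "output": "{'terminated': True}"}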

        # Build messages using the NousFnCallPrompt system with the tool schema in text.
        # Start with the converted conversation (images/text preserved).
        converted_msgs = convert_responses_items_to_completion_messages(
            messages, allow_images_in_tool_results=False, use_xml_tools=True
        )

        # Build function schemas from the tools array
        function_schemas = []
        if tools:
            from ...computers import is_agent_computer

            for tool in tools:
                tool_type = tool.get("type")

                if tool_type == "computer":
                    # For computer tools, use the QWEN3_COMPUTER_TOOL schema
                    computer = tool.get("computer")
                    if computer and is_agent_computer(computer):
                        function_schemas.append(QWEN3_COMPUTER_TOOL["function"])
                elif tool_type == "function":
                    # For function tools, use the provided function schema
                    function_schema = tool.get("function")
                    if function_schema:
                        function_schemas.append(function_schema)

        # If no tools were provided or no computer tool was found, use the default QWEN3_COMPUTER_TOOL
        if not function_schemas:
            function_schemas = [QWEN3_COMPUTER_TOOL["function"]]

        # Prepend the Nous-generated system message if available
        nous_system = build_nous_system(function_schemas)
        completion_messages = ([nous_system] if nous_system else []) + converted_msgs

        # If there is no screenshot in the conversation, take one now and inject it.
        def _has_any_image(msgs: List[Dict[str, Any]]) -> bool:
            for m in msgs:
                content = m.get("content")
                if isinstance(content, list):
                    for p in content:
                        if isinstance(p, dict) and p.get("type") == "image_url":
                            return True
            return False

        pre_output_items: List[Dict[str, Any]] = []
        if not _has_any_image(completion_messages):
            if computer_handler is None or not hasattr(computer_handler, "screenshot"):
                raise RuntimeError(
                    "No screenshots present and computer_handler.screenshot is not available."
                )
            screenshot_b64 = await computer_handler.screenshot()
            if not screenshot_b64:
                raise RuntimeError("Failed to capture screenshot from computer_handler.")

            if _on_screenshot:  # guard added: the callback defaults to None
                await _on_screenshot(screenshot_b64, "screenshot_before")

            # Check whether computer_handler has a get_current_url method
            screenshot_text = "Here is the next screenshot. Think about what to do next."
            if hasattr(computer_handler, "get_current_url"):
                try:
                    current_url = await computer_handler.get_current_url()
                    screenshot_text = f"Current URL: {current_url[:100]}\nHere is the next screenshot. Think about what to do next."
                except Exception:
                    # If get_current_url fails, fall back to the default text
                    pass

            # Inject a user message with the screenshot so the model can see the current context
            screenshot_msg = {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"},
                    },
                    {"type": "text", "text": screenshot_text},
                ],
            }
            completion_messages.append(screenshot_msg)

        # Smart-resize all screenshots and attach min/max pixel hints. Fail fast if deps are missing.
        # Also record the last resized width/height to unnormalize coordinates later.
        last_rw: Optional[int] = None
        last_rh: Optional[int] = None
        MIN_PIXELS = 3136
        MAX_PIXELS = 12845056
        try:
            import base64
            import io

            from PIL import Image  # type: ignore
            from qwen_vl_utils import smart_resize  # type: ignore
        except Exception:
            raise ImportError(
                "qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`."
            )

        for msg in completion_messages:
            content = msg.get("content")
            if not isinstance(content, list):
                continue
            for part in content:
                if isinstance(part, dict) and part.get("type") == "image_url":
                    url = ((part.get("image_url") or {}).get("url")) or ""
                    # Expect a data URL like data:image/png;base64,<b64>
                    if url.startswith("data:") and "," in url:
                        b64 = url.split(",", 1)[1]
                        img_bytes = base64.b64decode(b64)
                        im = Image.open(io.BytesIO(img_bytes))
                        h, w = im.height, im.width
                        rh, rw = smart_resize(
                            h, w, factor=28, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS
                        )
                        # Attach hints to this image block
                        part["min_pixels"] = MIN_PIXELS
                        part["max_pixels"] = MAX_PIXELS
                        last_rw, last_rh = rw, rh
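        # Worked example (illustrative; the exact mapping lives in
        # .helpers.unnormalize_coordinate, which is outside this hunk): a 1920x1080
        # screenshot gives smart_resize(1080, 1920, factor=28, ...) == (1092, 1932),
        # both multiples of 28. Assuming the model emits coordinates in a 0..1000
        # reference space, a predicted (500, 250) would unnormalize to roughly
        # (1932 * 500 / 1000, 1092 * 250 / 1000) == (966, 273).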

        api_kwargs: Dict[str, Any] = {
            "model": model,
            "messages": completion_messages,
            "max_retries": max_retries,
            "stream": stream,
            **{k: v for k, v in kwargs.items()},
        }
        if use_prompt_caching:
            api_kwargs["use_prompt_caching"] = use_prompt_caching

        if _on_api_start:
            await _on_api_start(api_kwargs)

        response = await litellm.acompletion(**api_kwargs)

        if _on_api_end:
            await _on_api_end(api_kwargs, response)

        usage = {
            **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(  # type: ignore
                response.usage
            ).model_dump(),
            "response_cost": response._hidden_params.get("response_cost", 0.0),
        }
        if _on_usage:
            await _on_usage(usage)

        # Extract response data
        resp_dict = response.model_dump()  # type: ignore
        choice = (resp_dict.get("choices") or [{}])[0]
        message = choice.get("message") or {}
        content_text = message.get("content") or ""
        tool_calls_array = message.get("tool_calls") or []
        reasoning_text = message.get("reasoning") or ""

        output_items: List[Dict[str, Any]] = []

        # Add reasoning if present (Ollama Cloud format)
        if reasoning_text:
            output_items.append(make_reasoning_item(reasoning_text))

        # Priority 1: try to parse a tool call from the content text (OpenRouter format)
        tool_call = parse_tool_call_from_text(content_text)
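        # Illustrative content_text (assumed Qwen-style format; the actual parser is
        # .helpers.parse_tool_call_from_text, defined outside this hunk):
        #   I should click the search box.
        #   <tool_call>
        #   {"name": "computer", "arguments": {"action": "left_click", "coordinate": [512, 384]}}
        #   </tool_call>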

        if tool_call and isinstance(tool_call, dict):
            fn_name = tool_call.get("name") or "computer"
            raw_args = tool_call.get("arguments") or {}
            # Unnormalize coordinates to the actual screen size using the last resized dims
            if last_rw is None or last_rh is None:
                raise RuntimeError(
                    "No screenshots found to derive dimensions for coordinate unnormalization."
                )
            args = await unnormalize_coordinate(raw_args, (last_rw, last_rh))

            # Extract thoughts (text before the <tool_call> tag)
            thoughts = ""
            if "<tool_call>" in content_text:
                thoughts = content_text.split("<tool_call>")[0].strip()

            # Build an OpenAI-style tool call so we can reuse the converter
            fake_cm = {
                "role": "assistant",
                "content": thoughts,  # preserve thoughts before the tool call
                "tool_calls": [
                    {
                        "type": "function",
                        "id": "call_0",
                        "function": {
                            "name": fn_name,
                            "arguments": json.dumps(args),
                        },
                    }
                ],
            }
            output_items.extend(convert_completion_messages_to_responses_items([fake_cm]))
        elif tool_calls_array:
            # Priority 2: use the tool_calls field if present (Ollama Cloud format).
            # Process and unnormalize coordinates in the tool calls.
            processed_tool_calls = []
            for tc in tool_calls_array:
                function = tc.get("function", {})
                fn_name = function.get("name", "computer")
                args_str = function.get("arguments", "{}")

                try:
                    args = json.loads(args_str)

                    # Unnormalize coordinates if present
                    if "coordinate" in args and last_rw is not None and last_rh is not None:
                        args = await unnormalize_coordinate(args, (last_rw, last_rh))

                    # Convert Qwen format to Computer Calls format if this is a computer tool
                    if fn_name == "computer":
                        converted_action = convert_qwen_tool_args_to_computer_action(args)
                        if converted_action:
                            args = converted_action

                    processed_tool_calls.append(
                        {
                            "type": tc.get("type", "function"),
                            "id": tc.get("id", "call_0"),
                            "function": {
                                "name": fn_name,
                                "arguments": json.dumps(args),
                            },
                        }
                    )
                except json.JSONDecodeError:
                    # Keep the original if parsing fails
                    processed_tool_calls.append(tc)

            fake_cm = {
                "role": "assistant",
                "content": content_text if content_text else "",
                "tool_calls": processed_tool_calls,
            }
            output_items.extend(convert_completion_messages_to_responses_items([fake_cm]))
        else:
            # No tool calls found in either format, so return a text response
            fake_cm = {"role": "assistant", "content": content_text}
            output_items.extend(convert_completion_messages_to_responses_items([fake_cm]))

        # Check whether this is a terminate action; if so, add a final assistant message to stop the loop
        has_terminate = False
        for item in output_items:
            if item.get("type") == "computer_call":
                action = item.get("action", {})
                if action.get("type") == "terminate":
                    has_terminate = True
                    break
            elif item.get("type") == "function_call":
                try:
                    args = json.loads(item.get("arguments", "{}"))
                    if args.get("action") == "terminate":
                        has_terminate = True
                        break
                except Exception:
                    pass

        # If terminate was detected, ensure the LAST item is an assistant message to exit the loop.
        # The generic agent loop checks: while new_items[-1].get("role") != "assistant"
        if has_terminate:
            output_items.append(
                {
                    "type": "message",
                    "role": "assistant",
                    "content": [{"type": "output_text", "text": ""}],
                }
            )

        # Prepend any pre_output_items (e.g., a simulated screenshot-taking message)
        return {"output": (pre_output_items + output_items), "usage": usage}
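    # Illustrative return shape (assumed; it mirrors the construction above):
    #   {"output": [<reasoning item?>, <computer_call / function_call / message items>],
    #    "usage": {"input_tokens": ..., "output_tokens": ..., "total_tokens": ...,
    #              "response_cost": ...}}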

    def get_capabilities(self) -> List[AgentCapability]:
        return ["step"]

    async def predict_click(
        self, model: str, image_b64: str, instruction: str, **kwargs
    ) -> Optional[Tuple[int, int]]:
        """
        Predict click coordinates using Qwen3-VL via litellm.acompletion.

        Only exposes a reduced tool schema with left_click to bias the model toward a single click.
        Returns (x, y) in absolute pixels when screen dimensions can be obtained; otherwise normalized 0..1000 integers.
        """
        # Reduced tool
        reduced_tool = {
            "type": "function",
            "function": {
                **QWEN3_COMPUTER_TOOL["function"],
                "parameters": {
                    "type": "object",
                    "properties": {
                        "action": {"type": "string", "enum": ["left_click"]},
                        "coordinate": {
                            "description": "(x, y) in 0..1000 reference space",
                            "type": "array",
                            "items": {"type": ["number", "integer"]},
                            "minItems": 2,
                            "maxItems": 2,
                        },
                    },
                    "required": ["action", "coordinate"],
                },
            },
        }

        # Build the Nous system message (the lazy import inside the helper already raises clear guidance if missing)
        nous_system = build_nous_system([reduced_tool["function"]])

        # Pre-process using smart_resize
        min_pixels = 3136
        max_pixels = 12845056
        try:
            # Lazy imports to avoid a hard dependency
            import base64
            import io

            # If PIL is available, estimate the size from the image to derive smart bounds
            from PIL import Image
            from qwen_vl_utils import smart_resize  # type: ignore

            img_bytes = base64.b64decode(image_b64)
            im = Image.open(io.BytesIO(img_bytes))
            h, w = im.height, im.width
            rh, rw = smart_resize(h, w, factor=28, min_pixels=min_pixels, max_pixels=max_pixels)
        except Exception:
            raise ImportError(
                "qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`."
            )

        messages = []
        if nous_system:
            messages.append(nous_system)
        image_block: Dict[str, Any] = {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{image_b64}"},
            "min_pixels": min_pixels,
            "max_pixels": max_pixels,
        }
        # A single user message with the image and instruction, matching OpenAI-style content blocks
        messages.append(
            {
                "role": "user",
                "content": [
                    image_block,
                    {"type": "text", "text": instruction},
                ],
            }
        )

        api_kwargs: Dict[str, Any] = {
            "model": model,
            "messages": messages,
            **{k: v for k, v in kwargs.items()},
        }
        response = await litellm.acompletion(**api_kwargs)
        resp = response.model_dump()  # type: ignore
        choice = (resp.get("choices") or [{}])[0]
        content_text = ((choice.get("message") or {}).get("content")) or ""
        tool_call = parse_tool_call_from_text(content_text) or {}
        args = tool_call.get("arguments") or {}
        # (width, height) order, matching the unnormalization in predict_step above
        # (the released file passes (rh, rw) here, which swaps the axes)
        args = await unnormalize_coordinate(args, (rw, rh))
        coord = args.get("coordinate")
        if isinstance(coord, (list, tuple)) and len(coord) >= 2:
            return int(coord[0]), int(coord[1])
        return None


# ComputerUse tool schema (OpenAI function tool format)
QWEN3_COMPUTER_TOOL: dict[str, Any] = {
    "type": "function",
    "function": {
        "name": "computer",
        "description": (
            "Use a mouse and keyboard to interact with a computer, and take screenshots.\n"
            "* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.\n"
            "* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try waiting and taking another screenshot.\n"
            "* The screen's resolution is 1000x1000.\n"
            "* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.\n"
            "* If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.\n"
            "* Make sure to click any buttons, links, icons, etc. with the cursor tip in the center of the element. Don't click boxes on their edges."
        ),
        "parameters": {
            "type": "object",
            "properties": {
                "action": {
                    "description": "The action to perform.",
                    "enum": [
                        "key",
                        "type",
                        "mouse_move",
                        "left_click",
                        "left_click_drag",
                        "right_click",
                        "middle_click",
                        "double_click",
                        "triple_click",
                        "scroll",
                        "hscroll",
                        "screenshot",
                        "wait",
                    ],
                    "type": "string",
                },
                "keys": {
                    "description": "Required only by action=key.",
                    "type": "array",
                    "items": {"type": "string"},
                },
                "text": {
                    "description": "Required only by action=type and action=answer.",
                    "type": "string",
                },
                "coordinate": {
                    "description": "(x, y): Pixel coordinates from the top-left.",
                    "type": "array",
                    "items": {"type": ["number", "integer"]},
                    "minItems": 2,
                    "maxItems": 2,
                },
                "pixels": {
                    "description": "Scroll amount. Positive=up, negative=down. For scroll/hscroll.",
                    "type": "number",
                },
                "time": {
                    "description": "Seconds to wait (action=wait).",
                    "type": "number",
                },
            },
            "required": ["action"],
        },
    },
}
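For orientation, a minimal sketch of exercising the new loop directly, outside the package's agent runner. It assumes litellm credentials for some fara-7b endpoint and the Pillow/qwen-vl-utils extras are installed; the model id and image path are placeholders, not values from this diff:

import asyncio
import base64

from agent.loops.fara.config import FaraVlmConfig


async def main() -> None:
    config = FaraVlmConfig()  # normally selected via @register_agent(models=r"(?i).*fara-7b.*")
    with open("screenshot.png", "rb") as f:  # placeholder screenshot
        image_b64 = base64.b64encode(f.read()).decode()
    # predict_click sends one user message (image + instruction) and parses a
    # single left_click tool call out of the model's text response.
    coords = await config.predict_click(
        model="openrouter/microsoft/fara-7b",  # hypothetical model id matching the regex
        image_b64=image_b64,
        instruction="Click the search box",
    )
    print(coords)  # (x, y) in pixels, or None if no tool call could be parsed


asyncio.run(main())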