cua-agent 0.4.34__py3-none-any.whl → 0.4.36__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cua-agent might be problematic.

Files changed (61)
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/huggingfacelocal_adapter.py +54 -61
  4. agent/adapters/human_adapter.py +116 -114
  5. agent/adapters/mlxvlm_adapter.py +110 -99
  6. agent/adapters/models/__init__.py +14 -6
  7. agent/adapters/models/generic.py +7 -4
  8. agent/adapters/models/internvl.py +66 -30
  9. agent/adapters/models/opencua.py +23 -8
  10. agent/adapters/models/qwen2_5_vl.py +7 -4
  11. agent/agent.py +184 -158
  12. agent/callbacks/__init__.py +4 -4
  13. agent/callbacks/base.py +45 -31
  14. agent/callbacks/budget_manager.py +22 -10
  15. agent/callbacks/image_retention.py +18 -13
  16. agent/callbacks/logging.py +55 -42
  17. agent/callbacks/operator_validator.py +3 -1
  18. agent/callbacks/pii_anonymization.py +19 -16
  19. agent/callbacks/telemetry.py +67 -61
  20. agent/callbacks/trajectory_saver.py +90 -70
  21. agent/cli.py +115 -110
  22. agent/computers/__init__.py +13 -8
  23. agent/computers/base.py +32 -19
  24. agent/computers/cua.py +33 -25
  25. agent/computers/custom.py +78 -71
  26. agent/decorators.py +23 -14
  27. agent/human_tool/__init__.py +2 -7
  28. agent/human_tool/__main__.py +6 -2
  29. agent/human_tool/server.py +48 -37
  30. agent/human_tool/ui.py +235 -185
  31. agent/integrations/hud/__init__.py +15 -21
  32. agent/integrations/hud/agent.py +101 -83
  33. agent/integrations/hud/proxy.py +90 -57
  34. agent/loops/__init__.py +25 -21
  35. agent/loops/anthropic.py +537 -483
  36. agent/loops/base.py +13 -14
  37. agent/loops/composed_grounded.py +135 -149
  38. agent/loops/gemini.py +31 -12
  39. agent/loops/glm45v.py +135 -133
  40. agent/loops/gta1.py +47 -50
  41. agent/loops/holo.py +4 -2
  42. agent/loops/internvl.py +6 -11
  43. agent/loops/moondream3.py +36 -12
  44. agent/loops/omniparser.py +215 -210
  45. agent/loops/openai.py +49 -50
  46. agent/loops/opencua.py +29 -41
  47. agent/loops/qwen.py +510 -0
  48. agent/loops/uitars.py +237 -202
  49. agent/proxy/examples.py +54 -50
  50. agent/proxy/handlers.py +27 -34
  51. agent/responses.py +330 -330
  52. agent/types.py +11 -5
  53. agent/ui/__init__.py +1 -1
  54. agent/ui/__main__.py +1 -1
  55. agent/ui/gradio/app.py +23 -18
  56. agent/ui/gradio/ui_components.py +310 -161
  57. {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/METADATA +18 -10
  58. cua_agent-0.4.36.dist-info/RECORD +64 -0
  59. cua_agent-0.4.34.dist-info/RECORD +0 -63
  60. {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/WHEEL +0 -0
  61. {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/entry_points.txt +0 -0
agent/loops/qwen.py ADDED
@@ -0,0 +1,510 @@
+ """
+ Qwen3-VL agent loop implementation using litellm with function/tool calling.
+ - Passes a ComputerUse tool schema to acompletion
+ - Converts between Responses items and completion messages using helpers
+ """
+
+ from __future__ import annotations
+
+ import json
+ import re
+ from typing import Any, Dict, List, Optional, Tuple
+
+ import litellm
+ from litellm.responses.litellm_completion_transformation.transformation import (
+     LiteLLMCompletionResponsesConfig,
+ )
+
+ from ..decorators import register_agent
+ from ..loops.base import AsyncAgentConfig
+ from ..responses import (
+     convert_completion_messages_to_responses_items,
+     convert_responses_items_to_completion_messages,
+ )
+ from ..types import AgentCapability
+
+ # ComputerUse tool schema (OpenAI function tool format)
+ QWEN3_COMPUTER_TOOL: Dict[str, Any] = {
+     "type": "function",
+     "function": {
+         "name": "computer",
+         "description": (
+             "Use a mouse and keyboard to interact with a computer, and take screenshots.\n"
+             "* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.\n"
+             "* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try wait and taking another screenshot.\n"
+             "* The screen's resolution is 1000x1000.\n"
+             "* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.\n"
+             "* If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.\n"
+             "* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges."
+         ),
+         "parameters": {
+             "type": "object",
+             "properties": {
+                 "action": {
+                     "description": "The action to perform.",
+                     "enum": [
+                         "key",
+                         "type",
+                         "mouse_move",
+                         "left_click",
+                         "left_click_drag",
+                         "right_click",
+                         "middle_click",
+                         "double_click",
+                         "triple_click",
+                         "scroll",
+                         "hscroll",
+                         "screenshot",
+                         "wait",
+                         # "terminate",
+                         # "answer",
+                     ],
+                     "type": "string",
+                 },
+                 "keys": {
+                     "description": "Required only by action=key.",
+                     "type": "array",
+                     "items": {"type": "string"},
+                 },
+                 "text": {
+                     "description": "Required only by action=type and action=answer.",
+                     "type": "string",
+                 },
+                 "coordinate": {
+                     "description": "(x, y): Pixel coordinates from top-left.",
+                     "type": "array",
+                     "items": {"type": ["number", "integer"]},
+                     "minItems": 2,
+                     "maxItems": 2,
+                 },
+                 "pixels": {
+                     "description": "Scroll amount. Positive=up, negative=down. For scroll/hscroll.",
+                     "type": "number",
+                 },
+                 "time": {
+                     "description": "Seconds to wait (action=wait).",
+                     "type": "number",
+                 },
+                 # "status": {
+                 #     "description": "Task status (action=terminate).",
+                 #     "type": "string",
+                 #     "enum": ["success", "failure"],
+                 # },
+             },
+             "required": ["action"],
+         },
+     },
+ }
+
+
+ def _build_nous_system(functions: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
+     """Use qwen-agent NousFnCallPrompt to generate a system message embedding tool schema."""
+     try:
+         from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
+             ContentItem as NousContentItem,
+         )
+         from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
+             Message as NousMessage,
+         )
+         from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
+             NousFnCallPrompt,
+         )
+     except ImportError:
+         raise ImportError(
+             "qwen-agent not installed. Please install it with `pip install cua-agent[qwen]`."
+         )
+     msgs = NousFnCallPrompt().preprocess_fncall_messages(
+         messages=[
+             NousMessage(
+                 role="system", content=[NousContentItem(text="You are a helpful assistant.")]
+             )
+         ],
+         functions=functions,
+         lang="en",
+     )
+     sys = msgs[0].model_dump()
+     # Convert qwen-agent structured content to OpenAI-style content list
+     content = [{"type": "text", "text": c["text"]} for c in sys.get("content", [])]
+     return {"role": "system", "content": content}
+
+
+ def _parse_tool_call_from_text(text: str) -> Optional[Dict[str, Any]]:
+     """Extract JSON object within <tool_call>...</tool_call> from model text."""
+     m = re.search(r"<tool_call>\s*(\{[\s\S]*?\})\s*</tool_call>", text)
+     if not m:
+         return None
+     try:
+         return json.loads(m.group(1))
+     except Exception:
+         return None
+
+
+ async def _unnormalize_coordinate(args: Dict[str, Any], dims: Tuple[int, int]) -> Dict[str, Any]:
+     """Coordinates appear in 0..1000 space, scale to actual screen size using dims if provided."""
+     coord = args.get("coordinate")
+     if not coord or not isinstance(coord, (list, tuple)) or len(coord) < 2:
+         return args
+     x, y = float(coord[0]), float(coord[1])
+     width, height = float(dims[0]), float(dims[1])
+     x_abs = max(0.0, min(width, (x / 1000.0) * width))
+     y_abs = max(0.0, min(height, (y / 1000.0) * height))
+     args = {**args, "coordinate": [round(x_abs), round(y_abs)]}
+     return args
+
+
+ def convert_qwen_tool_args_to_computer_action(args: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+     """
+     Convert Qwen computer tool arguments to the Computer Calls action schema.
+
+     Qwen (example):
+         {"action": "left_click", "coordinate": [114, 68]}
+
+     Target (example):
+         {"action": "left_click", "x": 114, "y": 68}
+
+     Other mappings:
+     - right_click, middle_click, double_click (triple_click -> double_click)
+     - mouse_move -> { action: "move", x, y }
+     - key -> { action: "keypress", keys: [...] }
+     - type -> { action: "type", text }
+     - scroll/hscroll -> { action: "scroll", scroll_x, scroll_y, x, y }
+     - wait -> { action: "wait" }
+     - terminate/answer are not direct UI actions; return None for now
+     """
+     if not isinstance(args, dict):
+         return None
+
+     action = args.get("action")
+     if not isinstance(action, str):
+         return None
+
+     # Coordinates helper
+     coord = args.get("coordinate")
+     x = y = None
+     if isinstance(coord, (list, tuple)) and len(coord) >= 2:
+         try:
+             x = int(round(float(coord[0])))
+             y = int(round(float(coord[1])))
+         except Exception:
+             x = y = None
+
+     # Map actions
+     a = action.lower()
+     if a in {"left_click", "right_click", "middle_click", "double_click"}:
+         if x is None or y is None:
+             return None
+         return {"action": a, "x": x, "y": y}
+     if a == "triple_click":
+         # Approximate as double_click
+         if x is None or y is None:
+             return None
+         return {"action": "double_click", "x": x, "y": y}
+     if a == "mouse_move":
+         if x is None or y is None:
+             return None
+         return {"action": "move", "x": x, "y": y}
+     if a == "key":
+         keys = args.get("keys")
+         if isinstance(keys, list) and all(isinstance(k, str) for k in keys):
+             return {"action": "keypress", "keys": keys}
+         return None
+     if a == "type":
+         text = args.get("text")
+         if isinstance(text, str):
+             return {"action": "type", "text": text}
+         return None
+     if a in {"scroll", "hscroll"}:
+         pixels = args.get("pixels") or 0
+         try:
+             pixels_val = int(round(float(pixels)))
+         except Exception:
+             pixels_val = 0
+         scroll_x = pixels_val if a == "hscroll" else 0
+         scroll_y = pixels_val if a == "scroll" else 0
+         # Include cursor position if available (optional)
+         out: Dict[str, Any] = {"action": "scroll", "scroll_x": scroll_x, "scroll_y": scroll_y}
+         if x is not None and y is not None:
+             out.update({"x": x, "y": y})
+         return out
+     if a == "wait":
+         return {"action": "wait"}
+
+     # Non-UI or terminal actions: terminate/answer -> not mapped here
+     return None
+
+
+ @register_agent(models=r"(?i).*qwen.*", priority=-1)
+ class Qwen3VlConfig(AsyncAgentConfig):
+     async def predict_step(
+         self,
+         messages: List[Dict[str, Any]],
+         model: str,
+         tools: Optional[List[Dict[str, Any]]] = None,
+         max_retries: Optional[int] = None,
+         stream: bool = False,
+         computer_handler=None,
+         use_prompt_caching: Optional[bool] = False,
+         _on_api_start=None,
+         _on_api_end=None,
+         _on_usage=None,
+         _on_screenshot=None,
+         **kwargs,
+     ) -> Dict[str, Any]:
+         # Build messages using NousFnCallPrompt system with tool schema in text
+         # Start with converted conversation (images/text preserved)
+         converted_msgs = convert_responses_items_to_completion_messages(
+             messages,
+             allow_images_in_tool_results=False,
+         )
+
+         # Prepend Nous-generated system if available
+         nous_system = _build_nous_system([QWEN3_COMPUTER_TOOL["function"]])
+         completion_messages = ([nous_system] if nous_system else []) + converted_msgs
+
+         # If there is no screenshot in the conversation, take one now and inject it.
+         # Also record a pre_output_items assistant message to reflect action.
+         def _has_any_image(msgs: List[Dict[str, Any]]) -> bool:
+             for m in msgs:
+                 content = m.get("content")
+                 if isinstance(content, list):
+                     for p in content:
+                         if isinstance(p, dict) and p.get("type") == "image_url":
+                             return True
+             return False
+
+         pre_output_items: List[Dict[str, Any]] = []
+         if not _has_any_image(completion_messages):
+             if computer_handler is None or not hasattr(computer_handler, "screenshot"):
+                 raise RuntimeError(
+                     "No screenshots present and computer_handler.screenshot is not available."
+                 )
+             screenshot_b64 = await computer_handler.screenshot()
+             if not screenshot_b64:
+                 raise RuntimeError("Failed to capture screenshot from computer_handler.")
+             # Inject a user message with the screenshot so the model can see current context
+             completion_messages.append(
+                 {
+                     "role": "user",
+                     "content": [
+                         {
+                             "type": "image_url",
+                             "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"},
+                         },
+                         {"type": "text", "text": "Current screen"},
+                     ],
+                 }
+             )
+             # Add assistant message to outputs to reflect the action, similar to composed_grounded.py
+             pre_output_items.append(
+                 {
+                     "type": "message",
+                     "role": "assistant",
+                     "content": [
+                         {
+                             "type": "text",
+                             "text": "Taking a screenshot to see the current computer screen.",
+                         }
+                     ]
+                 }
+             )
+
+         # Smart-resize all screenshots and attach min/max pixel hints. Fail fast if deps missing.
+         # Also record the last resized width/height to unnormalize coordinates later.
+         last_rw: Optional[int] = None
+         last_rh: Optional[int] = None
+         MIN_PIXELS = 3136
+         MAX_PIXELS = 12845056
+         try:
+             import base64
+             import io
+
+             from PIL import Image  # type: ignore
+             from qwen_vl_utils import smart_resize  # type: ignore
+         except Exception:
+             raise ImportError(
+                 "qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`."
+             )
+
+         for msg in completion_messages:
+             content = msg.get("content")
+             if not isinstance(content, list):
+                 continue
+             for part in content:
+                 if isinstance(part, dict) and part.get("type") == "image_url":
+                     url = ((part.get("image_url") or {}).get("url")) or ""
+                     # Expect data URL like data:image/png;base64,<b64>
+                     if url.startswith("data:") and "," in url:
+                         b64 = url.split(",", 1)[1]
+                         img_bytes = base64.b64decode(b64)
+                         im = Image.open(io.BytesIO(img_bytes))
+                         h, w = im.height, im.width
+                         rh, rw = smart_resize(
+                             h, w, factor=32, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS
+                         )
+                         # Attach hints on this image block
+                         part["min_pixels"] = MIN_PIXELS
+                         part["max_pixels"] = MAX_PIXELS
+                         last_rw, last_rh = rw, rh
+
+         api_kwargs: Dict[str, Any] = {
+             "model": model,
+             "messages": completion_messages,
+             "max_retries": max_retries,
+             "stream": stream,
+             **{k: v for k, v in kwargs.items()},
+         }
+         if use_prompt_caching:
+             api_kwargs["use_prompt_caching"] = use_prompt_caching
+
+         if _on_api_start:
+             await _on_api_start(api_kwargs)
+
+         response = await litellm.acompletion(**api_kwargs)
+
+         if _on_api_end:
+             await _on_api_end(api_kwargs, response)
+
+         usage = {
+             **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(  # type: ignore
+                 response.usage
+             ).model_dump(),
+             "response_cost": response._hidden_params.get("response_cost", 0.0),
+         }
+         if _on_usage:
+             await _on_usage(usage)
+
+         # Parse tool call from text; then convert to responses items via fake tool_calls
+         resp_dict = response.model_dump()  # type: ignore
+         choice = (resp_dict.get("choices") or [{}])[0]
+         content_text = ((choice.get("message") or {}).get("content")) or ""
+         tool_call = _parse_tool_call_from_text(content_text)
+
+         output_items: List[Dict[str, Any]] = []
+         if tool_call and isinstance(tool_call, dict):
+             fn_name = tool_call.get("name") or "computer"
+             raw_args = tool_call.get("arguments") or {}
+             # Unnormalize coordinates to actual screen size using last resized dims
+             if last_rw is None or last_rh is None:
+                 raise RuntimeError(
+                     "No screenshots found to derive dimensions for coordinate unnormalization."
+                 )
+             args = await _unnormalize_coordinate(raw_args, (last_rw, last_rh))
+
+             # Build an OpenAI-style tool call so we can reuse the converter
+             fake_cm = {
+                 "role": "assistant",
+                 "tool_calls": [
+                     {
+                         "type": "function",
+                         "id": "call_0",
+                         "function": {
+                             "name": fn_name,
+                             "arguments": json.dumps(args),
+                         },
+                     }
+                 ],
+             }
+             output_items.extend(convert_completion_messages_to_responses_items([fake_cm]))
+         else:
+             # Fallback: just return assistant text
+             fake_cm = {"role": "assistant", "content": content_text}
+             output_items.extend(convert_completion_messages_to_responses_items([fake_cm]))
+
+         # Prepend any pre_output_items (e.g., simulated screenshot-taking message)
+         return {"output": (pre_output_items + output_items), "usage": usage}
+
+     def get_capabilities(self) -> List[AgentCapability]:
+         return ["step"]
+
+     async def predict_click(
+         self, model: str, image_b64: str, instruction: str, **kwargs
+     ) -> Optional[Tuple[int, int]]:
+         """
+         Predict click coordinates using Qwen3-VL via litellm.acompletion.
+
+         Only exposes a reduced tool schema with left_click to bias model to output a single click.
+         Returns (x, y) absolute pixels when screen dimensions can be obtained; otherwise normalized 0..1000 integers.
+         """
+         # Reduced tool
+         reduced_tool = {
+             "type": "function",
+             "function": {
+                 **QWEN3_COMPUTER_TOOL["function"],
+                 "parameters": {
+                     "type": "object",
+                     "properties": {
+                         "action": {"type": "string", "enum": ["left_click"]},
+                         "coordinate": {
+                             "description": "(x, y) in 0..1000 reference space",
+                             "type": "array",
+                             "items": {"type": ["number", "integer"]},
+                             "minItems": 2,
+                             "maxItems": 2,
+                         },
+                     },
+                     "required": ["action", "coordinate"],
+                 },
+             },
+         }
+
+         # Build Nous system (lazy import inside helper already raises clear guidance if missing)
+         nous_system = _build_nous_system([reduced_tool["function"]])
+
+         # Pre-process using smart_resize
+         min_pixels = 3136
+         max_pixels = 12845056
+         try:
+             # Lazy import to avoid hard dependency
+             import base64
+             import io
+
+             # If PIL is available, estimate size from image to derive smart bounds
+             from PIL import Image
+             from qwen_vl_utils import smart_resize  # type: ignore
+
+             img_bytes = base64.b64decode(image_b64)
+             im = Image.open(io.BytesIO(img_bytes))
+             h, w = im.height, im.width
+             # Qwen notebook suggests factor=32 and a wide min/max range
+             rh, rw = smart_resize(h, w, factor=32, min_pixels=min_pixels, max_pixels=max_pixels)
+         except Exception:
+             raise ImportError(
+                 "qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`."
+             )
+
+         messages = []
+         if nous_system:
+             messages.append(nous_system)
+         image_block: Dict[str, Any] = {
+             "type": "image_url",
+             "image_url": {"url": f"data:image/png;base64,{image_b64}"},
+             "min_pixels": min_pixels,
+             "max_pixels": max_pixels,
+         }
+         # Single user message with image and instruction, matching OpenAI-style content blocks
+         messages.append(
+             {
+                 "role": "user",
+                 "content": [
+                     image_block,
+                     {"type": "text", "text": instruction},
+                 ],
+             }
+         )
+
+         api_kwargs: Dict[str, Any] = {
+             "model": model,
+             "messages": messages,
+             **{k: v for k, v in kwargs.items()},
+         }
+         response = await litellm.acompletion(**api_kwargs)
+         resp = response.model_dump()  # type: ignore
+         choice = (resp.get("choices") or [{}])[0]
+         content_text = ((choice.get("message") or {}).get("content")) or ""
+         tool_call = _parse_tool_call_from_text(content_text) or {}
+         args = tool_call.get("arguments") or {}
+         args = await _unnormalize_coordinate(args, (rh, rw))
+         coord = args.get("coordinate")
+         if isinstance(coord, (list, tuple)) and len(coord) >= 2:
+             return int(coord[0]), int(coord[1])
+         return None
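
The new loop is easiest to follow end to end with a small example. The sketch below is not part of the release; it assumes cua-agent 0.4.36 is installed so that the new agent.loops.qwen module is importable, and it calls the private helpers from the diff above directly, purely to illustrate how a <tool_call> block in the model text is parsed, scaled from the 0..1000 reference space to an assumed 1920x1080 screen, and mapped onto the Computer Calls action schema.

# Illustrative sketch only; assumes cua-agent 0.4.36 is installed and uses the
# private helpers from agent/loops/qwen.py (shown in the diff above) directly
# for demonstration.
import asyncio

from agent.loops.qwen import (
    _parse_tool_call_from_text,
    _unnormalize_coordinate,
    convert_qwen_tool_args_to_computer_action,
)

# Example model output wrapping a computer tool call in <tool_call> tags.
SAMPLE = (
    '<tool_call>{"name": "computer", "arguments": '
    '{"action": "left_click", "coordinate": [500, 250]}}</tool_call>'
)


async def main() -> None:
    call = _parse_tool_call_from_text(SAMPLE)
    assert call is not None
    # Scale the 0..1000 reference coordinates to an assumed 1920x1080 screen.
    args = await _unnormalize_coordinate(call["arguments"], (1920, 1080))
    print(args)  # {'action': 'left_click', 'coordinate': [960, 270]}
    # Map the Qwen tool arguments onto the Computer Calls action schema.
    print(convert_qwen_tool_args_to_computer_action(args))
    # {'action': 'left_click', 'x': 960, 'y': 270}


asyncio.run(main())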