cua-agent 0.4.35__py3-none-any.whl → 0.4.36__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic.

agent/computers/base.py CHANGED
@@ -28,8 +28,12 @@ class AsyncComputerHandler(Protocol):
         """Get screen dimensions as (width, height)."""
         ...
 
-    async def screenshot(self) -> str:
-        """Take a screenshot and return as base64 string."""
+    async def screenshot(self, text: Optional[str] = None) -> str:
+        """Take a screenshot and return as base64 string.
+
+        Args:
+            text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
+        """
         ...
 
     async def click(self, x: int, y: int, button: str = "left") -> None:
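
The only behavioural change in this file is the new optional text argument on screenshot(). Below is a minimal sketch (not from the package) of a handler that satisfies the updated Protocol method; the class name and the in-memory frame are hypothetical, only the screenshot() signature mirrors the diff:

    import base64
    from typing import Optional


    class InMemoryScreenshotHandler:
        """Hypothetical handler used only to illustrate the new signature."""

        def __init__(self, png_bytes: bytes) -> None:
            self._png_bytes = png_bytes  # pretend this is a captured frame

        async def screenshot(self, text: Optional[str] = None) -> str:
            # `text` exists only for compatibility with GPT-4o-style calls and is ignored.
            return base64.b64encode(self._png_bytes).decode("utf-8")

Existing callers that use await handler.screenshot() keep working; callers that pass a description, e.g. await handler.screenshot(text="before clicking Submit"), no longer violate the Protocol.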
agent/computers/cua.py CHANGED
@@ -36,8 +36,12 @@ class cuaComputerHandler(AsyncComputerHandler):
         screen_size = await self.interface.get_screen_size()
         return screen_size["width"], screen_size["height"]
 
-    async def screenshot(self) -> str:
-        """Take a screenshot and return as base64 string."""
+    async def screenshot(self, text: Optional[str] = None) -> str:
+        """Take a screenshot and return as base64 string.
+
+        Args:
+            text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
+        """
         assert self.interface is not None
         screenshot_bytes = await self.interface.screenshot()
         return base64.b64encode(screenshot_bytes).decode("utf-8")
agent/computers/custom.py CHANGED
@@ -122,8 +122,12 @@ class CustomComputerHandler(AsyncComputerHandler):
 
         return self._last_screenshot_size
 
-    async def screenshot(self) -> str:
-        """Take a screenshot and return as base64 string."""
+    async def screenshot(self, text: Optional[str] = None) -> str:
+        """Take a screenshot and return as base64 string.
+
+        Args:
+            text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
+        """
         result = await self._call_function(self.functions["screenshot"])
         b64_str = self._to_b64_str(result)  # type: ignore
 
agent/loops/__init__.py CHANGED
@@ -15,8 +15,8 @@ from . import (
     omniparser,
     openai,
     opencua,
-    uitars,
     qwen,
+    uitars,
 )
 
 __all__ = [
agent/loops/omniparser.py CHANGED
@@ -243,18 +243,20 @@ async def replace_computer_call_with_function(
                 "id": item.get("id"),
                 "call_id": item.get("call_id"),
                 "status": "completed",
-                # Fall back to string representation
-                "content": f"Used tool: {action_data.get("type")}({json.dumps(fn_args)})",
             }
         ]
 
     elif item_type == "computer_call_output":
-        # Simple conversion: computer_call_output -> function_call_output
+        output = item.get("output")
+
+        if isinstance(output, dict):
+            output = [output]
+
         return [
             {
                 "type": "function_call_output",
                 "call_id": item.get("call_id"),
-                "content": [item.get("output")],
+                "output": output,
                 "id": item.get("id"),
                 "status": "completed",
             }
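
The substantive fix in this hunk is on the computer_call_output branch: the payload now goes out under an "output" key (previously "content"), and a bare dict payload is wrapped in a list. A standalone sketch of just that conversion, with a hypothetical input item (the package's real helper does more than this):

    from typing import Any, Dict, List


    def convert_computer_call_output(item: Dict[str, Any]) -> List[Dict[str, Any]]:
        # Mirrors the new branch in the diff: normalize the payload to a list
        # and emit it under "output" instead of "content".
        output = item.get("output")
        if isinstance(output, dict):
            output = [output]
        return [
            {
                "type": "function_call_output",
                "call_id": item.get("call_id"),
                "output": output,
                "id": item.get("id"),
                "status": "completed",
            }
        ]


    # Hypothetical input: a screenshot returned for a computer call.
    item = {
        "type": "computer_call_output",
        "call_id": "call_1",
        "output": {"type": "input_image", "image_url": "data:image/png;base64,..."},
    }
    # The dict payload comes back wrapped in a single-element list under "output".
    print(convert_computer_call_output(item)[0]["output"])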
agent/loops/qwen.py CHANGED
@@ -3,12 +3,13 @@ Qwen3-VL agent loop implementation using litellm with function/tool calling.
 - Passes a ComputerUse tool schema to acompletion
 - Converts between Responses items and completion messages using helpers
 """
-from __future__ import annotations
 
-from typing import Any, Dict, List, Optional, Tuple
+from __future__ import annotations
 
 import json
 import re
+from typing import Any, Dict, List, Optional, Tuple
+
 import litellm
 from litellm.responses.litellm_completion_transformation.transformation import (
     LiteLLMCompletionResponsesConfig,
@@ -16,12 +17,11 @@ from litellm.responses.litellm_completion_transformation.transformation import (
 
 from ..decorators import register_agent
 from ..loops.base import AsyncAgentConfig
-from ..types import AgentCapability
 from ..responses import (
-    convert_responses_items_to_completion_messages,
     convert_completion_messages_to_responses_items,
+    convert_responses_items_to_completion_messages,
 )
-
+from ..types import AgentCapability
 
 # ComputerUse tool schema (OpenAI function tool format)
 QWEN3_COMPUTER_TOOL: Dict[str, Any] = {
@@ -96,18 +96,29 @@ QWEN3_COMPUTER_TOOL: Dict[str, Any] = {
     },
 }
 
+
 def _build_nous_system(functions: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
     """Use qwen-agent NousFnCallPrompt to generate a system message embedding tool schema."""
     try:
         from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
-            NousFnCallPrompt,
-            Message as NousMessage,
             ContentItem as NousContentItem,
         )
+        from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
+            Message as NousMessage,
+        )
+        from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
+            NousFnCallPrompt,
+        )
     except ImportError:
-        raise ImportError("qwen-agent not installed. Please install it with `pip install cua-agent[qwen]`.")
+        raise ImportError(
+            "qwen-agent not installed. Please install it with `pip install cua-agent[qwen]`."
+        )
     msgs = NousFnCallPrompt().preprocess_fncall_messages(
-        messages=[NousMessage(role="system", content=[NousContentItem(text="You are a helpful assistant.")])],
+        messages=[
+            NousMessage(
+                role="system", content=[NousContentItem(text="You are a helpful assistant.")]
+            )
+        ],
         functions=functions,
         lang="en",
     )
@@ -116,6 +127,7 @@ def _build_nous_system(functions: List[Dict[str, Any]]) -> Optional[Dict[str, An
     content = [{"type": "text", "text": c["text"]} for c in sys.get("content", [])]
     return {"role": "system", "content": content}
 
+
 def _parse_tool_call_from_text(text: str) -> Optional[Dict[str, Any]]:
     """Extract JSON object within <tool_call>...</tool_call> from model text."""
     m = re.search(r"<tool_call>\s*(\{[\s\S]*?\})\s*</tool_call>", text)
@@ -126,6 +138,7 @@ def _parse_tool_call_from_text(text: str) -> Optional[Dict[str, Any]]:
     except Exception:
         return None
 
+
 async def _unnormalize_coordinate(args: Dict[str, Any], dims: Tuple[int, int]) -> Dict[str, Any]:
     """Coordinates appear in 0..1000 space, scale to actual screen size using dims if provided."""
     coord = args.get("coordinate")
@@ -262,7 +275,9 @@ class Qwen3VlConfig(AsyncAgentConfig):
         pre_output_items: List[Dict[str, Any]] = []
         if not _has_any_image(completion_messages):
             if computer_handler is None or not hasattr(computer_handler, "screenshot"):
-                raise RuntimeError("No screenshots present and computer_handler.screenshot is not available.")
+                raise RuntimeError(
+                    "No screenshots present and computer_handler.screenshot is not available."
+                )
             screenshot_b64 = await computer_handler.screenshot()
             if not screenshot_b64:
                 raise RuntimeError("Failed to capture screenshot from computer_handler.")
@@ -271,7 +286,10 @@ class Qwen3VlConfig(AsyncAgentConfig):
                 {
                     "role": "user",
                     "content": [
-                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"}},
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"},
+                        },
                         {"type": "text", "text": "Current screen"},
                     ],
                 }
@@ -282,7 +300,10 @@ class Qwen3VlConfig(AsyncAgentConfig):
                     "type": "message",
                     "role": "assistant",
                     "content": [
-                        {"type": "text", "text": "Taking a screenshot to see the current computer screen."}
+                        {
+                            "type": "text",
+                            "text": "Taking a screenshot to see the current computer screen.",
+                        }
                     ],
                 }
             )
@@ -294,11 +315,15 @@ class Qwen3VlConfig(AsyncAgentConfig):
         MIN_PIXELS = 3136
         MAX_PIXELS = 12845056
         try:
-            from qwen_vl_utils import smart_resize  # type: ignore
+            import base64
+            import io
+
             from PIL import Image  # type: ignore
-            import base64, io
+            from qwen_vl_utils import smart_resize  # type: ignore
         except Exception:
-            raise ImportError("qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`.")
+            raise ImportError(
+                "qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`."
+            )
 
         for msg in completion_messages:
             content = msg.get("content")
@@ -306,14 +331,16 @@ class Qwen3VlConfig(AsyncAgentConfig):
                 continue
             for part in content:
                 if isinstance(part, dict) and part.get("type") == "image_url":
-                    url = (((part.get("image_url") or {}).get("url")) or "")
+                    url = ((part.get("image_url") or {}).get("url")) or ""
                     # Expect data URL like data:image/png;base64,<b64>
                     if url.startswith("data:") and "," in url:
                         b64 = url.split(",", 1)[1]
                         img_bytes = base64.b64decode(b64)
                         im = Image.open(io.BytesIO(img_bytes))
                         h, w = im.height, im.width
-                        rh, rw = smart_resize(h, w, factor=32, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS)
+                        rh, rw = smart_resize(
+                            h, w, factor=32, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS
+                        )
                         # Attach hints on this image block
                         part["min_pixels"] = MIN_PIXELS
                         part["max_pixels"] = MAX_PIXELS
@@ -349,7 +376,7 @@ class Qwen3VlConfig(AsyncAgentConfig):
         # Parse tool call from text; then convert to responses items via fake tool_calls
        resp_dict = response.model_dump()  # type: ignore
        choice = (resp_dict.get("choices") or [{}])[0]
-        content_text = (((choice.get("message") or {}).get("content")) or "")
+        content_text = ((choice.get("message") or {}).get("content")) or ""
        tool_call = _parse_tool_call_from_text(content_text)
 
        output_items: List[Dict[str, Any]] = []
@@ -358,7 +385,9 @@ class Qwen3VlConfig(AsyncAgentConfig):
             raw_args = tool_call.get("arguments") or {}
             # Unnormalize coordinates to actual screen size using last resized dims
             if last_rw is None or last_rh is None:
-                raise RuntimeError("No screenshots found to derive dimensions for coordinate unnormalization.")
+                raise RuntimeError(
+                    "No screenshots found to derive dimensions for coordinate unnormalization."
+                )
             args = await _unnormalize_coordinate(raw_args, (last_rw, last_rh))
 
             # Build an OpenAI-style tool call so we can reuse the converter
@@ -426,10 +455,12 @@ class Qwen3VlConfig(AsyncAgentConfig):
         max_pixels = 12845056
         try:
             # Lazy import to avoid hard dependency
-            from qwen_vl_utils import smart_resize  # type: ignore
+            import base64
+            import io
+
             # If PIL is available, estimate size from image to derive smart bounds
             from PIL import Image
-            import io, base64
+            from qwen_vl_utils import smart_resize  # type: ignore
 
             img_bytes = base64.b64decode(image_b64)
             im = Image.open(io.BytesIO(img_bytes))
@@ -437,16 +468,16 @@ class Qwen3VlConfig(AsyncAgentConfig):
             # Qwen notebook suggests factor=32 and a wide min/max range
             rh, rw = smart_resize(h, w, factor=32, min_pixels=min_pixels, max_pixels=max_pixels)
         except Exception:
-            raise ImportError("qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`.")
+            raise ImportError(
+                "qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`."
+            )
 
         messages = []
         if nous_system:
             messages.append(nous_system)
         image_block: Dict[str, Any] = {
-            "type": "image_url",
-            "image_url": {
-                "url": f"data:image/png;base64,{image_b64}"
-            },
+            "type": "image_url",
+            "image_url": {"url": f"data:image/png;base64,{image_b64}"},
             "min_pixels": min_pixels,
             "max_pixels": max_pixels,
         }
@@ -461,11 +492,15 @@ class Qwen3VlConfig(AsyncAgentConfig):
             }
         )
 
-        api_kwargs: Dict[str, Any] = {"model": model, "messages": messages, **{k: v for k, v in kwargs.items()}}
+        api_kwargs: Dict[str, Any] = {
+            "model": model,
+            "messages": messages,
+            **{k: v for k, v in kwargs.items()},
+        }
         response = await litellm.acompletion(**api_kwargs)
         resp = response.model_dump()  # type: ignore
         choice = (resp.get("choices") or [{}])[0]
-        content_text = (((choice.get("message") or {}).get("content")) or "")
+        content_text = ((choice.get("message") or {}).get("content")) or ""
         tool_call = _parse_tool_call_from_text(content_text) or {}
         args = tool_call.get("arguments") or {}
         args = await _unnormalize_coordinate(args, (rh, rw))
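
Most of the qwen.py hunks are formatting and import reordering, but they pass through the loop's two key helpers: pulling a <tool_call> JSON block out of the model text and rescaling its 0..1000-normalized coordinates to the smart-resized screenshot size. Below is a self-contained sketch of those two steps with hypothetical helper names, values, and dimensions; the package's own functions are _parse_tool_call_from_text and _unnormalize_coordinate, and the diff alone does not settle whether dims is ordered (width, height) or (height, width) (both orderings appear), so the sketch fixes it as (width, height):

    import json
    import re
    from typing import Any, Dict, Optional, Tuple


    def parse_tool_call(text: str) -> Optional[Dict[str, Any]]:
        # Same pattern as the diff: first JSON object inside <tool_call>...</tool_call>.
        m = re.search(r"<tool_call>\s*(\{[\s\S]*?\})\s*</tool_call>", text)
        if not m:
            return None
        try:
            return json.loads(m.group(1))
        except json.JSONDecodeError:
            return None


    def unnormalize(args: Dict[str, Any], dims: Tuple[int, int]) -> Dict[str, Any]:
        # Coordinates arrive in a 0..1000 space; scale them to the resized image size.
        width, height = dims
        coord = args.get("coordinate")
        if coord:
            x, y = coord
            args["coordinate"] = [round(x / 1000 * width), round(y / 1000 * height)]
        return args


    model_text = (
        '<tool_call>{"name": "computer_use", '
        '"arguments": {"action": "left_click", "coordinate": [500, 250]}}</tool_call>'
    )
    call = parse_tool_call(model_text)
    if call:
        # Assuming a smart-resized screenshot of 1280x720 (hypothetical values).
        print(unnormalize(call["arguments"], (1280, 720)))  # coordinate -> [640, 180]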
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cua-agent
-Version: 0.4.35
+Version: 0.4.36
 Summary: CUA (Computer Use) Agent for AI-driven computer interaction
 Author-Email: TryCua <gh@trycua.com>
 Requires-Python: >=3.12
@@ -22,9 +22,9 @@ agent/callbacks/telemetry.py,sha256=nCm2vq6ZBPfNvdz_MICn8LyBGLKhKBzYVE4sm31gpzE,
 agent/callbacks/trajectory_saver.py,sha256=4PIcitRlh0rIqmKsgLAvYF2qSrYBO5i_sGq2MvpwMDg,15853
 agent/cli.py,sha256=icDtgET50Ny8lBt7edKfsIiLPdh0Mdt-YzxAfLea5kQ,17296
 agent/computers/__init__.py,sha256=R2L3xdkD8FPcB0_qIp2WrhklnOVGnSaAvVqODuaLXq4,1475
-agent/computers/base.py,sha256=guxW_5EVpmAWtmaI_fQpc1owRFULcU4va6a9aFyg-is,2166
-agent/computers/cua.py,sha256=lDu8HuvJirmlGnab0URB9lTCAnE6S9k3VfkCwzeUROQ,4816
-agent/computers/custom.py,sha256=xszJK-PVz5mrt38HcW5p0PB4zEFAxeyVGz-buRfyDcE,7826
+agent/computers/base.py,sha256=Ud62zbSfgMuJ2Y6JrHVH25GG208rqKJBr4v1EXFfbKI,2310
+agent/computers/cua.py,sha256=xj1Tj8iVeBnQgqR5ALsjjOixpJq-oMcYkhLA_ag8Zbs,4960
+agent/computers/custom.py,sha256=r010ew-tO0mq3sjvEPome2ELTA5tPCtEgInDyhICaak,7970
 agent/decorators.py,sha256=KLSLczVt6AIh8IPp5YUIqJhNMpcbYUu-irCpc6uGKfI,1875
 agent/human_tool/__init__.py,sha256=2lp9aZLdId4iooY6sdMw4TwVmDdAvsKyZFJla99BpA0,748
 agent/human_tool/__main__.py,sha256=P4H50miHpkqRax6sfRG9PSRct2g82RLwfmshFvqpSLs,1069
@@ -33,7 +33,7 @@ agent/human_tool/ui.py,sha256=TiyBXeiSpBX6P96twx3FRU4J36_FfvYLuvgDrBHVHN4,30773
 agent/integrations/hud/__init__.py,sha256=fVJXPhTdu3-2-8h1qC4kTCtsphgajUO-rnuDJbMnvbw,5854
 agent/integrations/hud/agent.py,sha256=vfuU0t1vcwZhpxnuTNXs8-zQQ3p1RxJq53cI3PmGGqw,14544
 agent/integrations/hud/proxy.py,sha256=Kj9grnLbuaCS-2y2TXVuRBQwqifzh-UX0Q916V9PWyY,11718
-agent/loops/__init__.py,sha256=MrVFh0zYLn-cd8mNCKzqwowu5TFkzA1mlJgg69p6bHE,476
+agent/loops/__init__.py,sha256=n9idaCDs53Gheb3cIkZH8j8F54JZ-ymI_-bG_JyiLPU,476
 agent/loops/anthropic.py,sha256=t0YMTLfUnnWjdFXeeELbKcNcYSbbbKo43rYEmGvTcTg,71507
 agent/loops/base.py,sha256=hNEmXnTXEHeYy4WlPqEiatkc35KgEU2C52tHOL2B_JQ,2264
 agent/loops/composed_grounded.py,sha256=Cc5w9gU-5D0MP-Wjb4XLcjuNIN9EeRKXNyMtLwRoq8I,12395
@@ -44,10 +44,10 @@ agent/loops/holo.py,sha256=0FQJifXNrTaNIHaREb8R14byHOmzGvJfe_gUC5p9fP0,7503
 agent/loops/internvl.py,sha256=x9CCwYvANEWrWgO0ThE0trUKng28g5306L3pBT4CEFI,6561
 agent/loops/model_types.csv,sha256=GmFn4x80yoUpQZuQ-GXtJkPVlOLYWZ5u_5A73HRyeNE,112
 agent/loops/moondream3.py,sha256=Dr7rL-yqXD3TR-2YT6xQ588WMVTB_uobdUF-oLtQi_Y,18557
-agent/loops/omniparser.py,sha256=vClGdTufh4eKZYRClNtvUA0tg7hNuj4RWqcF6HohO4U,15592
+agent/loops/omniparser.py,sha256=N4SnPYi7vH84PwEfpyWfutmH7Ya6VON-Y3HVrxwCX4U,15464
 agent/loops/openai.py,sha256=6XWPWa-iW-2cSo60t_4qj9xEy_-5zbiKf_J2Pq1xo8g,8437
 agent/loops/opencua.py,sha256=H3MVJ2ghZgNduBsrUlmpaw3NDPM5xHZUEWMRMJfz0AU,4128
-agent/loops/qwen.py,sha256=ykYGIPLt8jdJ47KFKlY-Dnyc2PoWAbggKw_CnEmcyfg,20026
+agent/loops/qwen.py,sha256=2Vet53J1U5P-cz2Y8A448J2MZfYPdAid_UAdYm_pkBA,20669
 agent/loops/uitars.py,sha256=fLnQeld27S3orzlkbbjL2EPoz-ItR6ssq3sl2eQK-v4,31985
 agent/proxy/examples.py,sha256=rInzhqOE0ZDLN_2D0pbUWrxzkqcXnfujmAKs0THm6mU,6062
 agent/proxy/handlers.py,sha256=gHxx0tf-EfoLfoRQ4hYRcU3Fwh7tts7_5L8mBGcIz40,9306
@@ -58,7 +58,7 @@ agent/ui/__main__.py,sha256=Ee9KF16h4fWlb6J48OBqc7cQEbzSUZgNe0L7GlKsdpg,74
 agent/ui/gradio/__init__.py,sha256=yv4Mrfo-Sj2U5sVn_UJHAuwYCezo-5O4ItR2C9jzNko,145
 agent/ui/gradio/app.py,sha256=ILw0PVMfQgEM2xIjymnLagNk82UtvbjW5qf-pkgRyAM,9089
 agent/ui/gradio/ui_components.py,sha256=trTu7VuPYZgMKwZ4_8TfT3sQE-ILvLmEKdMzMsh0AqU,38964
-cua_agent-0.4.35.dist-info/METADATA,sha256=AjzIOzdJsfI7rf_Fhu-nDBTUfhQDkPPoK_nbg71_zGU,6909
-cua_agent-0.4.35.dist-info/WHEEL,sha256=9P2ygRxDrTJz3gsagc0Z96ukrxjr-LFBGOgv3AuKlCA,90
-cua_agent-0.4.35.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
-cua_agent-0.4.35.dist-info/RECORD,,
+cua_agent-0.4.36.dist-info/METADATA,sha256=VRSTrmCsgW83FyvlJUqdqEqeXf6x-WtAkGrE39QMrhU,6909
+cua_agent-0.4.36.dist-info/WHEEL,sha256=9P2ygRxDrTJz3gsagc0Z96ukrxjr-LFBGOgv3AuKlCA,90
+cua_agent-0.4.36.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
+cua_agent-0.4.36.dist-info/RECORD,,