cua-agent 0.4.35__py3-none-any.whl → 0.4.36__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic.

agent/computers/base.py CHANGED
@@ -28,8 +28,12 @@ class AsyncComputerHandler(Protocol):
         """Get screen dimensions as (width, height)."""
         ...
 
-    async def screenshot(self) -> str:
-        """Take a screenshot and return as base64 string."""
+    async def screenshot(self, text: Optional[str] = None) -> str:
+        """Take a screenshot and return as base64 string.
+
+        Args:
+            text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
+        """
         ...
 
     async def click(self, x: int, y: int, button: str = "left") -> None:
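
The only behavioural change in this file is the new optional text argument on screenshot(). Below is a minimal sketch (not from the package) of a handler that satisfies the updated Protocol method; the class name and the in-memory frame are hypothetical, only the screenshot() signature mirrors the diff:

    import base64
    from typing import Optional


    class InMemoryScreenshotHandler:
        """Hypothetical handler used only to illustrate the new signature."""

        def __init__(self, png_bytes: bytes) -> None:
            self._png_bytes = png_bytes  # pretend this is a captured frame

        async def screenshot(self, text: Optional[str] = None) -> str:
            # `text` exists only for compatibility with GPT-4o-style calls and is ignored.
            return base64.b64encode(self._png_bytes).decode("utf-8")

Existing callers that use await handler.screenshot() keep working; callers that pass a description, e.g. await handler.screenshot(text="before clicking Submit"), no longer violate the Protocol.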
agent/computers/cua.py CHANGED
@@ -36,8 +36,12 @@ class cuaComputerHandler(AsyncComputerHandler):
         screen_size = await self.interface.get_screen_size()
         return screen_size["width"], screen_size["height"]
 
-    async def screenshot(self) -> str:
-        """Take a screenshot and return as base64 string."""
+    async def screenshot(self, text: Optional[str] = None) -> str:
+        """Take a screenshot and return as base64 string.
+
+        Args:
+            text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
+        """
         assert self.interface is not None
         screenshot_bytes = await self.interface.screenshot()
         return base64.b64encode(screenshot_bytes).decode("utf-8")
agent/computers/custom.py CHANGED
@@ -122,8 +122,12 @@ class CustomComputerHandler(AsyncComputerHandler):
 
         return self._last_screenshot_size
 
-    async def screenshot(self) -> str:
-        """Take a screenshot and return as base64 string."""
+    async def screenshot(self, text: Optional[str] = None) -> str:
+        """Take a screenshot and return as base64 string.
+
+        Args:
+            text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
+        """
         result = await self._call_function(self.functions["screenshot"])
         b64_str = self._to_b64_str(result)  # type: ignore
 
agent/loops/__init__.py CHANGED
@@ -15,8 +15,8 @@ from . import (
     omniparser,
     openai,
     opencua,
-    uitars,
     qwen,
+    uitars,
 )
 
 __all__ = [
agent/loops/omniparser.py CHANGED
@@ -243,18 +243,20 @@ async def replace_computer_call_with_function(
                 "id": item.get("id"),
                 "call_id": item.get("call_id"),
                 "status": "completed",
-                # Fall back to string representation
-                "content": f"Used tool: {action_data.get("type")}({json.dumps(fn_args)})",
             }
         ]
 
     elif item_type == "computer_call_output":
-        # Simple conversion: computer_call_output -> function_call_output
+        output = item.get("output")
+
+        if isinstance(output, dict):
+            output = [output]
+
         return [
             {
                 "type": "function_call_output",
                 "call_id": item.get("call_id"),
-                "content": [item.get("output")],
+                "output": output,
                 "id": item.get("id"),
                 "status": "completed",
             }
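
The substantive fix in this hunk is on the computer_call_output branch: the payload now goes out under an "output" key (previously "content"), and a bare dict payload is wrapped in a list. A standalone sketch of just that conversion, with a hypothetical input item (the package's real helper does more than this):

    from typing import Any, Dict, List


    def convert_computer_call_output(item: Dict[str, Any]) -> List[Dict[str, Any]]:
        # Mirrors the new branch in the diff: normalize the payload to a list
        # and emit it under "output" instead of "content".
        output = item.get("output")
        if isinstance(output, dict):
            output = [output]
        return [
            {
                "type": "function_call_output",
                "call_id": item.get("call_id"),
                "output": output,
                "id": item.get("id"),
                "status": "completed",
            }
        ]


    # Hypothetical input: a screenshot returned for a computer call.
    item = {
        "type": "computer_call_output",
        "call_id": "call_1",
        "output": {"type": "input_image", "image_url": "data:image/png;base64,..."},
    }
    # The dict payload comes back wrapped in a single-element list under "output".
    print(convert_computer_call_output(item)[0]["output"])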
agent/loops/qwen.py CHANGED
@@ -3,12 +3,13 @@ Qwen3-VL agent loop implementation using litellm with function/tool calling.
 - Passes a ComputerUse tool schema to acompletion
 - Converts between Responses items and completion messages using helpers
 """
-from __future__ import annotations
 
-from typing import Any, Dict, List, Optional, Tuple
+from __future__ import annotations
 
 import json
 import re
+from typing import Any, Dict, List, Optional, Tuple
+
 import litellm
 from litellm.responses.litellm_completion_transformation.transformation import (
     LiteLLMCompletionResponsesConfig,
@@ -16,12 +17,11 @@ from litellm.responses.litellm_completion_transformation.transformation import (
 
 from ..decorators import register_agent
 from ..loops.base import AsyncAgentConfig
-from ..types import AgentCapability
 from ..responses import (
-    convert_responses_items_to_completion_messages,
     convert_completion_messages_to_responses_items,
+    convert_responses_items_to_completion_messages,
 )
-
+from ..types import AgentCapability
 
 # ComputerUse tool schema (OpenAI function tool format)
 QWEN3_COMPUTER_TOOL: Dict[str, Any] = {
@@ -96,18 +96,29 @@ QWEN3_COMPUTER_TOOL: Dict[str, Any] = {
     },
 }
 
+
 def _build_nous_system(functions: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
     """Use qwen-agent NousFnCallPrompt to generate a system message embedding tool schema."""
     try:
         from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
-            NousFnCallPrompt,
-            Message as NousMessage,
             ContentItem as NousContentItem,
         )
+        from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
+            Message as NousMessage,
+        )
+        from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
+            NousFnCallPrompt,
+        )
     except ImportError:
-        raise ImportError("qwen-agent not installed. Please install it with `pip install cua-agent[qwen]`.")
+        raise ImportError(
+            "qwen-agent not installed. Please install it with `pip install cua-agent[qwen]`."
+        )
     msgs = NousFnCallPrompt().preprocess_fncall_messages(
-        messages=[NousMessage(role="system", content=[NousContentItem(text="You are a helpful assistant.")])],
+        messages=[
+            NousMessage(
+                role="system", content=[NousContentItem(text="You are a helpful assistant.")]
+            )
+        ],
         functions=functions,
         lang="en",
     )
@@ -116,6 +127,7 @@ def _build_nous_system(functions: List[Dict[str, Any]]) -> Optional[Dict[str, An
     content = [{"type": "text", "text": c["text"]} for c in sys.get("content", [])]
     return {"role": "system", "content": content}
 
+
 def _parse_tool_call_from_text(text: str) -> Optional[Dict[str, Any]]:
     """Extract JSON object within <tool_call>...</tool_call> from model text."""
     m = re.search(r"<tool_call>\s*(\{[\s\S]*?\})\s*</tool_call>", text)
@@ -126,6 +138,7 @@ def _parse_tool_call_from_text(text: str) -> Optional[Dict[str, Any]]:
     except Exception:
         return None
 
+
 async def _unnormalize_coordinate(args: Dict[str, Any], dims: Tuple[int, int]) -> Dict[str, Any]:
     """Coordinates appear in 0..1000 space, scale to actual screen size using dims if provided."""
     coord = args.get("coordinate")
@@ -262,7 +275,9 @@ class Qwen3VlConfig(AsyncAgentConfig):
         pre_output_items: List[Dict[str, Any]] = []
         if not _has_any_image(completion_messages):
             if computer_handler is None or not hasattr(computer_handler, "screenshot"):
-                raise RuntimeError("No screenshots present and computer_handler.screenshot is not available.")
+                raise RuntimeError(
+                    "No screenshots present and computer_handler.screenshot is not available."
+                )
             screenshot_b64 = await computer_handler.screenshot()
             if not screenshot_b64:
                 raise RuntimeError("Failed to capture screenshot from computer_handler.")
@@ -271,7 +286,10 @@ class Qwen3VlConfig(AsyncAgentConfig):
                 {
                     "role": "user",
                     "content": [
-                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"}},
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"},
+                        },
                         {"type": "text", "text": "Current screen"},
                     ],
                 }
@@ -282,7 +300,10 @@ class Qwen3VlConfig(AsyncAgentConfig):
                     "type": "message",
                     "role": "assistant",
                     "content": [
-                        {"type": "text", "text": "Taking a screenshot to see the current computer screen."}
+                        {
+                            "type": "text",
+                            "text": "Taking a screenshot to see the current computer screen.",
+                        }
                     ],
                 }
             )
@@ -294,11 +315,15 @@ class Qwen3VlConfig(AsyncAgentConfig):
         MIN_PIXELS = 3136
         MAX_PIXELS = 12845056
         try:
-            from qwen_vl_utils import smart_resize  # type: ignore
+            import base64
+            import io
+
             from PIL import Image  # type: ignore
-            import base64, io
+            from qwen_vl_utils import smart_resize  # type: ignore
         except Exception:
-            raise ImportError("qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`.")
+            raise ImportError(
+                "qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`."
+            )
 
         for msg in completion_messages:
             content = msg.get("content")
@@ -306,14 +331,16 @@ class Qwen3VlConfig(AsyncAgentConfig):
                 continue
             for part in content:
                 if isinstance(part, dict) and part.get("type") == "image_url":
-                    url = (((part.get("image_url") or {}).get("url")) or "")
+                    url = ((part.get("image_url") or {}).get("url")) or ""
                     # Expect data URL like data:image/png;base64,<b64>
                     if url.startswith("data:") and "," in url:
                         b64 = url.split(",", 1)[1]
                         img_bytes = base64.b64decode(b64)
                         im = Image.open(io.BytesIO(img_bytes))
                         h, w = im.height, im.width
-                        rh, rw = smart_resize(h, w, factor=32, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS)
+                        rh, rw = smart_resize(
+                            h, w, factor=32, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS
+                        )
                         # Attach hints on this image block
                         part["min_pixels"] = MIN_PIXELS
                         part["max_pixels"] = MAX_PIXELS
@@ -349,7 +376,7 @@ class Qwen3VlConfig(AsyncAgentConfig):
         # Parse tool call from text; then convert to responses items via fake tool_calls
        resp_dict = response.model_dump()  # type: ignore
        choice = (resp_dict.get("choices") or [{}])[0]
-        content_text = (((choice.get("message") or {}).get("content")) or "")
+        content_text = ((choice.get("message") or {}).get("content")) or ""
        tool_call = _parse_tool_call_from_text(content_text)
 
        output_items: List[Dict[str, Any]] = []
@@ -358,7 +385,9 @@ class Qwen3VlConfig(AsyncAgentConfig):
             raw_args = tool_call.get("arguments") or {}
             # Unnormalize coordinates to actual screen size using last resized dims
             if last_rw is None or last_rh is None:
-                raise RuntimeError("No screenshots found to derive dimensions for coordinate unnormalization.")
+                raise RuntimeError(
+                    "No screenshots found to derive dimensions for coordinate unnormalization."
+                )
             args = await _unnormalize_coordinate(raw_args, (last_rw, last_rh))
 
             # Build an OpenAI-style tool call so we can reuse the converter
@@ -426,10 +455,12 @@ class Qwen3VlConfig(AsyncAgentConfig):
         max_pixels = 12845056
         try:
             # Lazy import to avoid hard dependency
-            from qwen_vl_utils import smart_resize  # type: ignore
+            import base64
+            import io
+
             # If PIL is available, estimate size from image to derive smart bounds
             from PIL import Image
-            import io, base64
+            from qwen_vl_utils import smart_resize  # type: ignore
 
             img_bytes = base64.b64decode(image_b64)
             im = Image.open(io.BytesIO(img_bytes))
@@ -437,16 +468,16 @@ class Qwen3VlConfig(AsyncAgentConfig):
             # Qwen notebook suggests factor=32 and a wide min/max range
             rh, rw = smart_resize(h, w, factor=32, min_pixels=min_pixels, max_pixels=max_pixels)
         except Exception:
-            raise ImportError("qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`.")
+            raise ImportError(
+                "qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`."
+            )
 
         messages = []
         if nous_system:
             messages.append(nous_system)
         image_block: Dict[str, Any] = {
-            "type": "image_url",
-            "image_url": {
-                "url": f"data:image/png;base64,{image_b64}"
-            },
+            "type": "image_url",
+            "image_url": {"url": f"data:image/png;base64,{image_b64}"},
             "min_pixels": min_pixels,
             "max_pixels": max_pixels,
         }
@@ -461,11 +492,15 @@ class Qwen3VlConfig(AsyncAgentConfig):
             }
         )
 
-        api_kwargs: Dict[str, Any] = {"model": model, "messages": messages, **{k: v for k, v in kwargs.items()}}
+        api_kwargs: Dict[str, Any] = {
+            "model": model,
+            "messages": messages,
+            **{k: v for k, v in kwargs.items()},
+        }
         response = await litellm.acompletion(**api_kwargs)
         resp = response.model_dump()  # type: ignore
         choice = (resp.get("choices") or [{}])[0]
-        content_text = (((choice.get("message") or {}).get("content")) or "")
+        content_text = ((choice.get("message") or {}).get("content")) or ""
         tool_call = _parse_tool_call_from_text(content_text) or {}
         args = tool_call.get("arguments") or {}
         args = await _unnormalize_coordinate(args, (rh, rw))
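
Most of the qwen.py hunks are formatting and import reordering, but they pass through the loop's two key helpers: pulling a <tool_call> JSON block out of the model text and rescaling its 0..1000-normalized coordinates to the smart-resized screenshot size. Below is a self-contained sketch of those two steps with hypothetical helper names, values, and dimensions; the package's own functions are _parse_tool_call_from_text and _unnormalize_coordinate, and the diff alone does not settle whether dims is ordered (width, height) or (height, width) (both orderings appear), so the sketch fixes it as (width, height):

    import json
    import re
    from typing import Any, Dict, Optional, Tuple


    def parse_tool_call(text: str) -> Optional[Dict[str, Any]]:
        # Same pattern as the diff: first JSON object inside <tool_call>...</tool_call>.
        m = re.search(r"<tool_call>\s*(\{[\s\S]*?\})\s*</tool_call>", text)
        if not m:
            return None
        try:
            return json.loads(m.group(1))
        except json.JSONDecodeError:
            return None


    def unnormalize(args: Dict[str, Any], dims: Tuple[int, int]) -> Dict[str, Any]:
        # Coordinates arrive in a 0..1000 space; scale them to the resized image size.
        width, height = dims
        coord = args.get("coordinate")
        if coord:
            x, y = coord
            args["coordinate"] = [round(x / 1000 * width), round(y / 1000 * height)]
        return args


    model_text = (
        '<tool_call>{"name": "computer_use", '
        '"arguments": {"action": "left_click", "coordinate": [500, 250]}}</tool_call>'
    )
    call = parse_tool_call(model_text)
    if call:
        # Assuming a smart-resized screenshot of 1280x720 (hypothetical values).
        print(unnormalize(call["arguments"], (1280, 720)))  # coordinate -> [640, 180]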
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cua-agent
-Version: 0.4.35
+Version: 0.4.36
 Summary: CUA (Computer Use) Agent for AI-driven computer interaction
 Author-Email: TryCua <gh@trycua.com>
 Requires-Python: >=3.12
@@ -22,9 +22,9 @@ agent/callbacks/telemetry.py,sha256=nCm2vq6ZBPfNvdz_MICn8LyBGLKhKBzYVE4sm31gpzE,
 agent/callbacks/trajectory_saver.py,sha256=4PIcitRlh0rIqmKsgLAvYF2qSrYBO5i_sGq2MvpwMDg,15853
 agent/cli.py,sha256=icDtgET50Ny8lBt7edKfsIiLPdh0Mdt-YzxAfLea5kQ,17296
 agent/computers/__init__.py,sha256=R2L3xdkD8FPcB0_qIp2WrhklnOVGnSaAvVqODuaLXq4,1475
-agent/computers/base.py,sha256=guxW_5EVpmAWtmaI_fQpc1owRFULcU4va6a9aFyg-is,2166
-agent/computers/cua.py,sha256=lDu8HuvJirmlGnab0URB9lTCAnE6S9k3VfkCwzeUROQ,4816
-agent/computers/custom.py,sha256=xszJK-PVz5mrt38HcW5p0PB4zEFAxeyVGz-buRfyDcE,7826
+agent/computers/base.py,sha256=Ud62zbSfgMuJ2Y6JrHVH25GG208rqKJBr4v1EXFfbKI,2310
+agent/computers/cua.py,sha256=xj1Tj8iVeBnQgqR5ALsjjOixpJq-oMcYkhLA_ag8Zbs,4960
+agent/computers/custom.py,sha256=r010ew-tO0mq3sjvEPome2ELTA5tPCtEgInDyhICaak,7970
 agent/decorators.py,sha256=KLSLczVt6AIh8IPp5YUIqJhNMpcbYUu-irCpc6uGKfI,1875
 agent/human_tool/__init__.py,sha256=2lp9aZLdId4iooY6sdMw4TwVmDdAvsKyZFJla99BpA0,748
 agent/human_tool/__main__.py,sha256=P4H50miHpkqRax6sfRG9PSRct2g82RLwfmshFvqpSLs,1069
@@ -33,7 +33,7 @@ agent/human_tool/ui.py,sha256=TiyBXeiSpBX6P96twx3FRU4J36_FfvYLuvgDrBHVHN4,30773
 agent/integrations/hud/__init__.py,sha256=fVJXPhTdu3-2-8h1qC4kTCtsphgajUO-rnuDJbMnvbw,5854
 agent/integrations/hud/agent.py,sha256=vfuU0t1vcwZhpxnuTNXs8-zQQ3p1RxJq53cI3PmGGqw,14544
 agent/integrations/hud/proxy.py,sha256=Kj9grnLbuaCS-2y2TXVuRBQwqifzh-UX0Q916V9PWyY,11718
-agent/loops/__init__.py,sha256=MrVFh0zYLn-cd8mNCKzqwowu5TFkzA1mlJgg69p6bHE,476
+agent/loops/__init__.py,sha256=n9idaCDs53Gheb3cIkZH8j8F54JZ-ymI_-bG_JyiLPU,476
 agent/loops/anthropic.py,sha256=t0YMTLfUnnWjdFXeeELbKcNcYSbbbKo43rYEmGvTcTg,71507
 agent/loops/base.py,sha256=hNEmXnTXEHeYy4WlPqEiatkc35KgEU2C52tHOL2B_JQ,2264
 agent/loops/composed_grounded.py,sha256=Cc5w9gU-5D0MP-Wjb4XLcjuNIN9EeRKXNyMtLwRoq8I,12395
@@ -44,10 +44,10 @@ agent/loops/holo.py,sha256=0FQJifXNrTaNIHaREb8R14byHOmzGvJfe_gUC5p9fP0,7503
 agent/loops/internvl.py,sha256=x9CCwYvANEWrWgO0ThE0trUKng28g5306L3pBT4CEFI,6561
 agent/loops/model_types.csv,sha256=GmFn4x80yoUpQZuQ-GXtJkPVlOLYWZ5u_5A73HRyeNE,112
 agent/loops/moondream3.py,sha256=Dr7rL-yqXD3TR-2YT6xQ588WMVTB_uobdUF-oLtQi_Y,18557
-agent/loops/omniparser.py,sha256=vClGdTufh4eKZYRClNtvUA0tg7hNuj4RWqcF6HohO4U,15592
+agent/loops/omniparser.py,sha256=N4SnPYi7vH84PwEfpyWfutmH7Ya6VON-Y3HVrxwCX4U,15464
 agent/loops/openai.py,sha256=6XWPWa-iW-2cSo60t_4qj9xEy_-5zbiKf_J2Pq1xo8g,8437
 agent/loops/opencua.py,sha256=H3MVJ2ghZgNduBsrUlmpaw3NDPM5xHZUEWMRMJfz0AU,4128
-agent/loops/qwen.py,sha256=ykYGIPLt8jdJ47KFKlY-Dnyc2PoWAbggKw_CnEmcyfg,20026
+agent/loops/qwen.py,sha256=2Vet53J1U5P-cz2Y8A448J2MZfYPdAid_UAdYm_pkBA,20669
 agent/loops/uitars.py,sha256=fLnQeld27S3orzlkbbjL2EPoz-ItR6ssq3sl2eQK-v4,31985
 agent/proxy/examples.py,sha256=rInzhqOE0ZDLN_2D0pbUWrxzkqcXnfujmAKs0THm6mU,6062
 agent/proxy/handlers.py,sha256=gHxx0tf-EfoLfoRQ4hYRcU3Fwh7tts7_5L8mBGcIz40,9306
@@ -58,7 +58,7 @@ agent/ui/__main__.py,sha256=Ee9KF16h4fWlb6J48OBqc7cQEbzSUZgNe0L7GlKsdpg,74
 agent/ui/gradio/__init__.py,sha256=yv4Mrfo-Sj2U5sVn_UJHAuwYCezo-5O4ItR2C9jzNko,145
 agent/ui/gradio/app.py,sha256=ILw0PVMfQgEM2xIjymnLagNk82UtvbjW5qf-pkgRyAM,9089
 agent/ui/gradio/ui_components.py,sha256=trTu7VuPYZgMKwZ4_8TfT3sQE-ILvLmEKdMzMsh0AqU,38964
-cua_agent-0.4.35.dist-info/METADATA,sha256=AjzIOzdJsfI7rf_Fhu-nDBTUfhQDkPPoK_nbg71_zGU,6909
-cua_agent-0.4.35.dist-info/WHEEL,sha256=9P2ygRxDrTJz3gsagc0Z96ukrxjr-LFBGOgv3AuKlCA,90
-cua_agent-0.4.35.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
-cua_agent-0.4.35.dist-info/RECORD,,
+cua_agent-0.4.36.dist-info/METADATA,sha256=VRSTrmCsgW83FyvlJUqdqEqeXf6x-WtAkGrE39QMrhU,6909
+cua_agent-0.4.36.dist-info/WHEEL,sha256=9P2ygRxDrTJz3gsagc0Z96ukrxjr-LFBGOgv3AuKlCA,90
+cua_agent-0.4.36.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
+cua_agent-0.4.36.dist-info/RECORD,,