cua-agent 0.4.35.tar.gz → 0.4.37.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cua-agent might be problematic.

Files changed (65)
  1. {cua_agent-0.4.35 → cua_agent-0.4.37}/PKG-INFO +1 -1
  2. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/computers/base.py +6 -2
  3. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/computers/cua.py +6 -2
  4. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/computers/custom.py +6 -2
  5. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/omniparser.py +142 -78
  6. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/qwen.py +63 -28
  7. {cua_agent-0.4.35 → cua_agent-0.4.37}/pyproject.toml +1 -1
  8. cua_agent-0.4.37/tests/conftest.py +84 -0
  9. cua_agent-0.4.37/tests/test_computer_agent.py +139 -0
  10. {cua_agent-0.4.35 → cua_agent-0.4.37}/README.md +0 -0
  11. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/__init__.py +0 -0
  12. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/__main__.py +0 -0
  13. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/adapters/__init__.py +0 -0
  14. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/adapters/huggingfacelocal_adapter.py +0 -0
  15. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/adapters/human_adapter.py +0 -0
  16. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/adapters/mlxvlm_adapter.py +0 -0
  17. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/adapters/models/__init__.py +0 -0
  18. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/adapters/models/generic.py +0 -0
  19. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/adapters/models/internvl.py +0 -0
  20. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/adapters/models/opencua.py +0 -0
  21. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/adapters/models/qwen2_5_vl.py +0 -0
  22. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/agent.py +0 -0
  23. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/callbacks/__init__.py +0 -0
  24. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/callbacks/base.py +0 -0
  25. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/callbacks/budget_manager.py +0 -0
  26. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/callbacks/image_retention.py +0 -0
  27. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/callbacks/logging.py +0 -0
  28. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/callbacks/operator_validator.py +0 -0
  29. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/callbacks/pii_anonymization.py +0 -0
  30. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/callbacks/prompt_instructions.py +0 -0
  31. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/callbacks/telemetry.py +0 -0
  32. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/callbacks/trajectory_saver.py +0 -0
  33. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/cli.py +0 -0
  34. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/computers/__init__.py +0 -0
  35. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/decorators.py +0 -0
  36. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/human_tool/__init__.py +0 -0
  37. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/human_tool/__main__.py +0 -0
  38. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/human_tool/server.py +0 -0
  39. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/human_tool/ui.py +0 -0
  40. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/integrations/hud/__init__.py +0 -0
  41. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/integrations/hud/agent.py +0 -0
  42. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/integrations/hud/proxy.py +0 -0
  43. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/__init__.py +1 -1
  44. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/anthropic.py +0 -0
  45. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/base.py +0 -0
  46. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/composed_grounded.py +0 -0
  47. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/gemini.py +0 -0
  48. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/glm45v.py +0 -0
  49. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/gta1.py +0 -0
  50. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/holo.py +0 -0
  51. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/internvl.py +0 -0
  52. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/model_types.csv +0 -0
  53. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/moondream3.py +0 -0
  54. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/openai.py +0 -0
  55. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/opencua.py +0 -0
  56. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/uitars.py +0 -0
  57. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/proxy/examples.py +0 -0
  58. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/proxy/handlers.py +0 -0
  59. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/responses.py +0 -0
  60. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/types.py +0 -0
  61. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/ui/__init__.py +0 -0
  62. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/ui/__main__.py +0 -0
  63. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/ui/gradio/__init__.py +0 -0
  64. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/ui/gradio/app.py +0 -0
  65. {cua_agent-0.4.35 → cua_agent-0.4.37}/agent/ui/gradio/ui_components.py +0 -0
{cua_agent-0.4.35 → cua_agent-0.4.37}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cua-agent
-Version: 0.4.35
+Version: 0.4.37
 Summary: CUA (Computer Use) Agent for AI-driven computer interaction
 Author-Email: TryCua <gh@trycua.com>
 Requires-Python: >=3.12
{cua_agent-0.4.35 → cua_agent-0.4.37}/agent/computers/base.py

@@ -28,8 +28,12 @@ class AsyncComputerHandler(Protocol):
         """Get screen dimensions as (width, height)."""
         ...

-    async def screenshot(self) -> str:
-        """Take a screenshot and return as base64 string."""
+    async def screenshot(self, text: Optional[str] = None) -> str:
+        """Take a screenshot and return as base64 string.
+
+        Args:
+            text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
+        """
         ...

     async def click(self, x: int, y: int, button: str = "left") -> None:
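The protocol change above widens screenshot() to take an optional text argument (sent by some GPT-4o-style tool calls) while keeping zero-argument calls working; the two handler hunks below apply the same signature. A minimal sketch of a conforming handler, with a made-up class name and stored bytes:

import base64
from typing import Optional


class InMemoryScreenshotHandler:
    """Illustrative handler satisfying the widened protocol (not part of the release)."""

    def __init__(self, png_bytes: bytes) -> None:
        self._png = png_bytes

    async def screenshot(self, text: Optional[str] = None) -> str:
        # `text` is accepted for compatibility but ignored, mirroring the
        # handlers changed in this release.
        return base64.b64encode(self._png).decode("utf-8")

Both await handler.screenshot() and await handler.screenshot("current screen") remain valid calls under the new signature.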
{cua_agent-0.4.35 → cua_agent-0.4.37}/agent/computers/cua.py

@@ -36,8 +36,12 @@ class cuaComputerHandler(AsyncComputerHandler):
         screen_size = await self.interface.get_screen_size()
         return screen_size["width"], screen_size["height"]

-    async def screenshot(self) -> str:
-        """Take a screenshot and return as base64 string."""
+    async def screenshot(self, text: Optional[str] = None) -> str:
+        """Take a screenshot and return as base64 string.
+
+        Args:
+            text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
+        """
         assert self.interface is not None
         screenshot_bytes = await self.interface.screenshot()
         return base64.b64encode(screenshot_bytes).decode("utf-8")
{cua_agent-0.4.35 → cua_agent-0.4.37}/agent/computers/custom.py

@@ -122,8 +122,12 @@ class CustomComputerHandler(AsyncComputerHandler):

         return self._last_screenshot_size

-    async def screenshot(self) -> str:
-        """Take a screenshot and return as base64 string."""
+    async def screenshot(self, text: Optional[str] = None) -> str:
+        """Take a screenshot and return as base64 string.
+
+        Args:
+            text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
+        """
         result = await self._call_function(self.functions["screenshot"])
         b64_str = self._to_b64_str(result)  # type: ignore

{cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/omniparser.py

@@ -14,67 +14,73 @@ import litellm

 from ..decorators import register_agent
 from ..loops.base import AsyncAgentConfig
+from ..responses import (
+    convert_completion_messages_to_responses_items,
+    convert_responses_items_to_completion_messages,
+)
 from ..types import AgentCapability, AgentResponse, Messages, Tools

 SOM_TOOL_SCHEMA = {
     "type": "function",
-    "name": "computer",
-    "description": "Control a computer by taking screenshots and interacting with UI elements. This tool shows screenshots with numbered elements overlaid on them. Each UI element has been assigned a unique ID number that you can see in the image. Use the element's ID number to interact with any element instead of pixel coordinates.",
-    "parameters": {
-        "type": "object",
-        "properties": {
-            "action": {
-                "type": "string",
-                "enum": [
-                    "screenshot",
-                    "click",
-                    "double_click",
-                    "drag",
-                    "type",
-                    "keypress",
-                    "scroll",
-                    "move",
-                    "wait",
-                    "get_current_url",
-                    "get_dimensions",
-                    "get_environment",
-                ],
-                "description": "The action to perform",
-            },
-            "element_id": {
-                "type": "integer",
-                "description": "The ID of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)",
-            },
-            "start_element_id": {
-                "type": "integer",
-                "description": "The ID of the element to start dragging from (required for drag action)",
-            },
-            "end_element_id": {
-                "type": "integer",
-                "description": "The ID of the element to drag to (required for drag action)",
-            },
-            "text": {
-                "type": "string",
-                "description": "The text to type (required for type action)",
-            },
-            "keys": {
-                "type": "string",
-                "description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')",
-            },
-            "button": {
-                "type": "string",
-                "description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
-            },
-            "scroll_x": {
-                "type": "integer",
-                "description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
-            },
-            "scroll_y": {
-                "type": "integer",
-                "description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
+    "function": {
+        "name": "computer",
+        "description": "Control a computer by taking screenshots and interacting with UI elements. This tool shows screenshots with numbered elements overlaid on them. Each UI element has been assigned a unique ID number that you can see in the image. Use the element's ID number to interact with any element instead of pixel coordinates.",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "action": {
+                    "type": "string",
+                    "enum": [
+                        "screenshot",
+                        "click",
+                        "double_click",
+                        "drag",
+                        "type",
+                        "keypress",
+                        "scroll",
+                        "move",
+                        "wait",
+                        "get_current_url",
+                        "get_dimensions",
+                        "get_environment",
+                    ],
+                    "description": "The action to perform",
+                },
+                "element_id": {
+                    "type": "integer",
+                    "description": "The ID of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)",
+                },
+                "start_element_id": {
+                    "type": "integer",
+                    "description": "The ID of the element to start dragging from (required for drag action)",
+                },
+                "end_element_id": {
+                    "type": "integer",
+                    "description": "The ID of the element to drag to (required for drag action)",
+                },
+                "text": {
+                    "type": "string",
+                    "description": "The text to type (required for type action)",
+                },
+                "keys": {
+                    "type": "string",
+                    "description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')",
+                },
+                "button": {
+                    "type": "string",
+                    "description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
+                },
+                "scroll_x": {
+                    "type": "integer",
+                    "description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
+                },
+                "scroll_y": {
+                    "type": "integer",
+                    "description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
+                },
             },
+            "required": ["action", "element_id"],
         },
-        "required": ["action"],
     },
 }

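The rewrite above nests the whole tool definition under a "function" key and tightens "required". The flat shape is what the OpenAI Responses API expects; the nested shape is what Chat Completions expects, matching this loop's switch from litellm.aresponses to litellm.acompletion later in the diff. A minimal sketch of the two shapes, with the fields abridged:

# Responses-API tool shape (old): the definition sits at the top level.
responses_style_tool = {
    "type": "function",
    "name": "computer",
    "parameters": {"type": "object", "properties": {}},
}

# Chat-Completions tool shape (new): the definition nests under "function".
chat_completions_style_tool = {
    "type": "function",
    "function": {
        "name": "computer",
        "parameters": {"type": "object", "properties": {}},
    },
}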
@@ -243,18 +249,20 @@ async def replace_computer_call_with_function(
                 "id": item.get("id"),
                 "call_id": item.get("call_id"),
                 "status": "completed",
-                # Fall back to string representation
-                "content": f"Used tool: {action_data.get("type")}({json.dumps(fn_args)})",
             }
         ]

     elif item_type == "computer_call_output":
-        # Simple conversion: computer_call_output -> function_call_output
+        output = item.get("output")
+
+        if isinstance(output, dict):
+            output = [output]
+
         return [
             {
                 "type": "function_call_output",
                 "call_id": item.get("call_id"),
-                "content": [item.get("output")],
+                "output": item.get("output"),
                 "id": item.get("id"),
                 "status": "completed",
             }
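For reference, a sketch of the two item shapes this branch now maps between; the field values are illustrative, not taken from the package:

# A Responses-style computer_call_output item (illustrative values)...
computer_call_output = {
    "type": "computer_call_output",
    "call_id": "call_1",
    "output": {"type": "input_image", "image_url": "data:image/png;base64,AAAA"},
}

# ...becomes a function_call_output item whose "output" key (replacing the
# old "content" key) carries the payload through unchanged.
function_call_output = {
    "type": "function_call_output",
    "call_id": computer_call_output["call_id"],
    "output": computer_call_output["output"],
    "id": None,
    "status": "completed",
}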
@@ -296,6 +304,13 @@ class OmniparserConfig(AsyncAgentConfig):

         llm_model = model.split("+")[-1]

+        # Get screen dimensions from computer handler
+        try:
+            width, height = await computer_handler.get_dimensions()
+        except Exception:
+            # Fallback to default dimensions if method fails
+            width, height = 1024, 768
+
         # Prepare tools for OpenAI API
         openai_tools, id2xy = _prepare_tools_for_omniparser(tools)

@@ -309,27 +324,43 @@
         result = parser.parse(image_data)
         if _on_screenshot:
             await _on_screenshot(result.annotated_image_base64, "annotated_image")
+
+        # Convert OmniParser normalized coordinates (0-1) to absolute pixels, convert to pixels
         for element in result.elements:
-            id2xy[element.id] = (
-                (element.bbox.x1 + element.bbox.x2) / 2,
-                (element.bbox.y1 + element.bbox.y2) / 2,
-            )
-
-        # handle computer calls -> function calls
-        new_messages = []
-        for message in messages:
+            norm_x = (element.bbox.x1 + element.bbox.x2) / 2
+            norm_y = (element.bbox.y1 + element.bbox.y2) / 2
+            pixel_x = int(norm_x * width)
+            pixel_y = int(norm_y * height)
+            id2xy[element.id] = (pixel_x, pixel_y)
+
+        # Replace the original screenshot with the annotated image
+        annotated_image_url = f"data:image/png;base64,{result.annotated_image_base64}"
+        last_computer_call_output["output"]["image_url"] = annotated_image_url
+
+        xy2id = {v: k for k, v in id2xy.items()}
+        messages_with_element_ids = []
+        for i, message in enumerate(messages):
             if not isinstance(message, dict):
                 message = message.__dict__
-            new_messages += await replace_computer_call_with_function(message, id2xy)  # type: ignore
-        messages = new_messages
+
+            msg_type = message.get("type")
+
+            if msg_type == "computer_call" and "action" in message:
+                action = message.get("action", {})
+
+            converted = await replace_computer_call_with_function(message, xy2id)  # type: ignore
+            messages_with_element_ids += converted
+
+        completion_messages = convert_responses_items_to_completion_messages(
+            messages_with_element_ids, allow_images_in_tool_results=False
+        )

         # Prepare API call kwargs
         api_kwargs = {
             "model": llm_model,
-            "input": messages,
+            "messages": completion_messages,
             "tools": openai_tools if openai_tools else None,
             "stream": stream,
-            "truncation": "auto",
             "num_retries": max_retries,
             **kwargs,
         }
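The coordinate handling above is the core of the change: OmniParser reports bounding boxes normalized to [0, 1], and the loop now scales the box centre by the screen dimensions fetched earlier. A self-contained sketch of that arithmetic:

def bbox_center_px(
    x1: float, y1: float, x2: float, y2: float, width: int, height: int
) -> tuple[int, int]:
    """Map a normalized [0, 1] bbox to its centre in absolute pixels."""
    return int((x1 + x2) / 2 * width), int((y1 + y2) / 2 * height)

# With the 1024x768 fallback dimensions from the previous hunk:
assert bbox_center_px(0.25, 0.25, 0.35, 0.35, 1024, 768) == (307, 230)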
@@ -340,8 +371,8 @@

         print(str(api_kwargs)[:1000])

-        # Use liteLLM responses
-        response = await litellm.aresponses(**api_kwargs)
+        # Use liteLLM completion
+        response = await litellm.acompletion(**api_kwargs)

         # Call API end hook
         if _on_api_end:
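Both entry points appear verbatim in this diff; the call shapes differ roughly as follows (sketch, parameters abridged):

# Old: the Responses API takes response items and a truncation strategy.
#     response = await litellm.aresponses(model=llm_model, input=messages, truncation="auto")
# New: Chat Completions takes a plain message list plus nested-format tools;
# "truncation" has no Chat Completions equivalent, so it is dropped above.
#     response = await litellm.acompletion(model=llm_model, messages=completion_messages, tools=openai_tools)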
@@ -355,12 +386,45 @@
         if _on_usage:
             await _on_usage(usage)

-        # handle som function calls -> xy computer calls
-        new_output = []
-        for i in range(len(response.output)):  # type: ignore
-            new_output += await replace_function_with_computer_call(response.output[i].model_dump(), id2xy)  # type: ignore
+        response_dict = response.model_dump()  # type: ignore
+        choice_messages = [choice["message"] for choice in response_dict["choices"]]
+        responses_items = []
+        for choice_message in choice_messages:
+            responses_items.extend(convert_completion_messages_to_responses_items([choice_message]))
+
+        # Convert element_id → x,y (similar to moondream's convert_computer_calls_desc2xy)
+        final_output = []
+        for item in responses_items:
+            if item.get("type") == "computer_call" and "action" in item:
+                action = item["action"].copy()
+
+                # Handle single element_id
+                if "element_id" in action:
+                    element_id = action["element_id"]
+                    if element_id in id2xy:
+                        x, y = id2xy[element_id]
+                        action["x"] = x
+                        action["y"] = y
+                        del action["element_id"]
+
+                # Handle start_element_id and end_element_id for drag operations
+                elif "start_element_id" in action and "end_element_id" in action:
+                    start_id = action["start_element_id"]
+                    end_id = action["end_element_id"]
+                    if start_id in id2xy and end_id in id2xy:
+                        start_x, start_y = id2xy[start_id]
+                        end_x, end_y = id2xy[end_id]
+                        action["path"] = [{"x": start_x, "y": start_y}, {"x": end_x, "y": end_y}]
+                        del action["start_element_id"]
+                        del action["end_element_id"]
+
+                converted_item = item.copy()
+                converted_item["action"] = action
+                final_output.append(converted_item)
+            else:
+                final_output.append(item)

-        return {"output": new_output, "usage": usage}
+        return {"output": final_output, "usage": usage}

     async def predict_click(
         self, model: str, image_b64: str, instruction: str, **kwargs
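The drag branch above maps two element ids onto a two-point pixel path. A compact sketch with illustrative coordinates:

id2xy = {3: (120, 240), 7: (560, 240)}  # element id -> pixel centre (illustrative)

action = {"type": "drag", "start_element_id": 3, "end_element_id": 7}
sx, sy = id2xy[action.pop("start_element_id")]
ex, ey = id2xy[action.pop("end_element_id")]
action["path"] = [{"x": sx, "y": sy}, {"x": ex, "y": ey}]

assert action == {
    "type": "drag",
    "path": [{"x": 120, "y": 240}, {"x": 560, "y": 240}],
}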
{cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/qwen.py

@@ -3,12 +3,13 @@ Qwen3-VL agent loop implementation using litellm with function/tool calling.
 - Passes a ComputerUse tool schema to acompletion
 - Converts between Responses items and completion messages using helpers
 """
-from __future__ import annotations

-from typing import Any, Dict, List, Optional, Tuple
+from __future__ import annotations

 import json
 import re
+from typing import Any, Dict, List, Optional, Tuple
+
 import litellm
 from litellm.responses.litellm_completion_transformation.transformation import (
     LiteLLMCompletionResponsesConfig,
@@ -16,12 +17,11 @@ from litellm.responses.litellm_completion_transformation.transformation import (

 from ..decorators import register_agent
 from ..loops.base import AsyncAgentConfig
-from ..types import AgentCapability
 from ..responses import (
-    convert_responses_items_to_completion_messages,
     convert_completion_messages_to_responses_items,
+    convert_responses_items_to_completion_messages,
 )
-
+from ..types import AgentCapability

 # ComputerUse tool schema (OpenAI function tool format)
 QWEN3_COMPUTER_TOOL: Dict[str, Any] = {
@@ -96,18 +96,29 @@ QWEN3_COMPUTER_TOOL: Dict[str, Any] = {
     },
 }

+
 def _build_nous_system(functions: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
     """Use qwen-agent NousFnCallPrompt to generate a system message embedding tool schema."""
     try:
         from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
-            NousFnCallPrompt,
-            Message as NousMessage,
             ContentItem as NousContentItem,
         )
+        from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
+            Message as NousMessage,
+        )
+        from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
+            NousFnCallPrompt,
+        )
     except ImportError:
-        raise ImportError("qwen-agent not installed. Please install it with `pip install cua-agent[qwen]`.")
+        raise ImportError(
+            "qwen-agent not installed. Please install it with `pip install cua-agent[qwen]`."
+        )
     msgs = NousFnCallPrompt().preprocess_fncall_messages(
-        messages=[NousMessage(role="system", content=[NousContentItem(text="You are a helpful assistant.")])],
+        messages=[
+            NousMessage(
+                role="system", content=[NousContentItem(text="You are a helpful assistant.")]
+            )
+        ],
         functions=functions,
         lang="en",
     )
@@ -116,6 +127,7 @@ def _build_nous_system(functions: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
     content = [{"type": "text", "text": c["text"]} for c in sys.get("content", [])]
     return {"role": "system", "content": content}

+
 def _parse_tool_call_from_text(text: str) -> Optional[Dict[str, Any]]:
     """Extract JSON object within <tool_call>...</tool_call> from model text."""
     m = re.search(r"<tool_call>\s*(\{[\s\S]*?\})\s*</tool_call>", text)
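The helper introduced above pulls the first <tool_call> JSON block out of free-form model text. A quick sketch of what that regex accepts; the sample text is made up:

import json
import re

text = 'Clicking now. <tool_call>\n{"name": "computer_use", "arguments": {"action": "click"}}\n</tool_call>'
m = re.search(r"<tool_call>\s*(\{[\s\S]*?\})\s*</tool_call>", text)
call = json.loads(m.group(1)) if m else None
assert call is not None and call["arguments"]["action"] == "click"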
@@ -126,6 +138,7 @@ def _parse_tool_call_from_text(text: str) -> Optional[Dict[str, Any]]:
     except Exception:
         return None

+
 async def _unnormalize_coordinate(args: Dict[str, Any], dims: Tuple[int, int]) -> Dict[str, Any]:
     """Coordinates appear in 0..1000 space, scale to actual screen size using dims if provided."""
     coord = args.get("coordinate")
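Per its docstring, the helper rescales Qwen's 0..1000-space coordinates to the (smart-resized) screen. A minimal standalone sketch of the scaling, assuming simple truncation; the real helper also handles a missing "coordinate" argument:

def unnormalize(x: float, y: float, width: int, height: int) -> tuple[int, int]:
    """Scale a 0..1000-space point to pixel coordinates (sketch only)."""
    return int(x / 1000 * width), int(y / 1000 * height)

assert unnormalize(500, 500, 1920, 1080) == (960, 540)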
@@ -262,7 +275,9 @@ class Qwen3VlConfig(AsyncAgentConfig):
         pre_output_items: List[Dict[str, Any]] = []
         if not _has_any_image(completion_messages):
             if computer_handler is None or not hasattr(computer_handler, "screenshot"):
-                raise RuntimeError("No screenshots present and computer_handler.screenshot is not available.")
+                raise RuntimeError(
+                    "No screenshots present and computer_handler.screenshot is not available."
+                )
             screenshot_b64 = await computer_handler.screenshot()
             if not screenshot_b64:
                 raise RuntimeError("Failed to capture screenshot from computer_handler.")
@@ -271,7 +286,10 @@ class Qwen3VlConfig(AsyncAgentConfig):
                 {
                     "role": "user",
                     "content": [
-                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"}},
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"},
+                        },
                         {"type": "text", "text": "Current screen"},
                     ],
                 }
@@ -282,7 +300,10 @@ class Qwen3VlConfig(AsyncAgentConfig):
                     "type": "message",
                     "role": "assistant",
                     "content": [
-                        {"type": "text", "text": "Taking a screenshot to see the current computer screen."}
+                        {
+                            "type": "text",
+                            "text": "Taking a screenshot to see the current computer screen.",
+                        }
                     ],
                 }
             )
@@ -294,11 +315,15 @@ class Qwen3VlConfig(AsyncAgentConfig):
         MIN_PIXELS = 3136
         MAX_PIXELS = 12845056
         try:
-            from qwen_vl_utils import smart_resize  # type: ignore
+            import base64
+            import io
+
             from PIL import Image  # type: ignore
-            import base64, io
+            from qwen_vl_utils import smart_resize  # type: ignore
         except Exception:
-            raise ImportError("qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`.")
+            raise ImportError(
+                "qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`."
+            )

         for msg in completion_messages:
             content = msg.get("content")
@@ -306,14 +331,16 @@ class Qwen3VlConfig(AsyncAgentConfig):
                 continue
             for part in content:
                 if isinstance(part, dict) and part.get("type") == "image_url":
-                    url = (((part.get("image_url") or {}).get("url")) or "")
+                    url = ((part.get("image_url") or {}).get("url")) or ""
                     # Expect data URL like data:image/png;base64,<b64>
                     if url.startswith("data:") and "," in url:
                         b64 = url.split(",", 1)[1]
                         img_bytes = base64.b64decode(b64)
                         im = Image.open(io.BytesIO(img_bytes))
                         h, w = im.height, im.width
-                        rh, rw = smart_resize(h, w, factor=32, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS)
+                        rh, rw = smart_resize(
+                            h, w, factor=32, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS
+                        )
                         # Attach hints on this image block
                         part["min_pixels"] = MIN_PIXELS
                         part["max_pixels"] = MAX_PIXELS
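For context, qwen_vl_utils.smart_resize returns the (height, width) the model effectively sees, snapped to multiples of factor within the min/max pixel budget; the tool-call coordinates parsed later are unnormalized against these resized dims. A sketch of a call, assuming the optional cua-agent[qwen] dependency is installed:

from qwen_vl_utils import smart_resize  # optional dependency (cua-agent[qwen])

# A 768x1024 screenshot with the same bounds as the loop above.
rh, rw = smart_resize(768, 1024, factor=32, min_pixels=3136, max_pixels=12845056)
assert rh % 32 == 0 and rw % 32 == 0  # dims snap to the resize factor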
@@ -349,7 +376,7 @@ class Qwen3VlConfig(AsyncAgentConfig):
         # Parse tool call from text; then convert to responses items via fake tool_calls
         resp_dict = response.model_dump()  # type: ignore
         choice = (resp_dict.get("choices") or [{}])[0]
-        content_text = (((choice.get("message") or {}).get("content")) or "")
+        content_text = ((choice.get("message") or {}).get("content")) or ""
         tool_call = _parse_tool_call_from_text(content_text)

         output_items: List[Dict[str, Any]] = []
@@ -358,7 +385,9 @@ class Qwen3VlConfig(AsyncAgentConfig):
             raw_args = tool_call.get("arguments") or {}
             # Unnormalize coordinates to actual screen size using last resized dims
             if last_rw is None or last_rh is None:
-                raise RuntimeError("No screenshots found to derive dimensions for coordinate unnormalization.")
+                raise RuntimeError(
+                    "No screenshots found to derive dimensions for coordinate unnormalization."
+                )
             args = await _unnormalize_coordinate(raw_args, (last_rw, last_rh))

             # Build an OpenAI-style tool call so we can reuse the converter
@@ -426,10 +455,12 @@ class Qwen3VlConfig(AsyncAgentConfig):
         max_pixels = 12845056
         try:
             # Lazy import to avoid hard dependency
-            from qwen_vl_utils import smart_resize  # type: ignore
+            import base64
+            import io
+
             # If PIL is available, estimate size from image to derive smart bounds
             from PIL import Image
-            import io, base64
+            from qwen_vl_utils import smart_resize  # type: ignore

             img_bytes = base64.b64decode(image_b64)
             im = Image.open(io.BytesIO(img_bytes))
437
468
  # Qwen notebook suggests factor=32 and a wide min/max range
438
469
  rh, rw = smart_resize(h, w, factor=32, min_pixels=min_pixels, max_pixels=max_pixels)
439
470
  except Exception:
440
- raise ImportError("qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`.")
471
+ raise ImportError(
472
+ "qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`."
473
+ )
441
474
 
442
475
  messages = []
443
476
  if nous_system:
444
477
  messages.append(nous_system)
445
478
  image_block: Dict[str, Any] = {
446
- "type": "image_url",
447
- "image_url": {
448
- "url": f"data:image/png;base64,{image_b64}"
449
- },
479
+ "type": "image_url",
480
+ "image_url": {"url": f"data:image/png;base64,{image_b64}"},
450
481
  "min_pixels": min_pixels,
451
482
  "max_pixels": max_pixels,
452
483
  }
@@ -461,11 +492,15 @@ class Qwen3VlConfig(AsyncAgentConfig):
             }
         )

-        api_kwargs: Dict[str, Any] = {"model": model, "messages": messages, **{k: v for k, v in kwargs.items()}}
+        api_kwargs: Dict[str, Any] = {
+            "model": model,
+            "messages": messages,
+            **{k: v for k, v in kwargs.items()},
+        }
         response = await litellm.acompletion(**api_kwargs)
         resp = response.model_dump()  # type: ignore
         choice = (resp.get("choices") or [{}])[0]
-        content_text = (((choice.get("message") or {}).get("content")) or "")
+        content_text = ((choice.get("message") or {}).get("content")) or ""
         tool_call = _parse_tool_call_from_text(content_text) or {}
         args = tool_call.get("arguments") or {}
         args = await _unnormalize_coordinate(args, (rh, rw))
{cua_agent-0.4.35 → cua_agent-0.4.37}/pyproject.toml

@@ -6,7 +6,7 @@ build-backend = "pdm.backend"

 [project]
 name = "cua-agent"
-version = "0.4.35"
+version = "0.4.37"
 description = "CUA (Computer Use) Agent for AI-driven computer interaction"
 readme = "README.md"
 authors = [
cua_agent-0.4.37/tests/conftest.py

@@ -0,0 +1,84 @@
+"""Pytest configuration and shared fixtures for agent package tests.
+
+This file contains shared fixtures and configuration for all agent tests.
+Following SRP: This file ONLY handles test setup/teardown.
+"""
+
+from unittest.mock import AsyncMock, MagicMock, Mock, patch
+
+import pytest
+
+
+@pytest.fixture
+def mock_litellm():
+    """Mock liteLLM completion calls.
+
+    Use this fixture to avoid making real LLM API calls during tests.
+    Returns a mock that simulates LLM responses.
+    """
+    with patch("litellm.acompletion") as mock_completion:
+
+        async def mock_response(*args, **kwargs):
+            """Simulate a typical LLM response."""
+            return {
+                "id": "chatcmpl-test123",
+                "object": "chat.completion",
+                "created": 1234567890,
+                "model": kwargs.get("model", "anthropic/claude-3-5-sonnet-20241022"),
+                "choices": [
+                    {
+                        "index": 0,
+                        "message": {
+                            "role": "assistant",
+                            "content": "This is a mocked response for testing.",
+                        },
+                        "finish_reason": "stop",
+                    }
+                ],
+                "usage": {
+                    "prompt_tokens": 10,
+                    "completion_tokens": 20,
+                    "total_tokens": 30,
+                },
+            }
+
+        mock_completion.side_effect = mock_response
+        yield mock_completion
+
+
+@pytest.fixture
+def mock_computer():
+    """Mock Computer interface for agent tests.
+
+    Use this fixture to test agent logic without requiring a real Computer instance.
+    """
+    computer = AsyncMock()
+    computer.interface = AsyncMock()
+    computer.interface.screenshot = AsyncMock(return_value=b"fake_screenshot_data")
+    computer.interface.left_click = AsyncMock()
+    computer.interface.type = AsyncMock()
+    computer.interface.key = AsyncMock()
+
+    # Mock context manager
+    computer.__aenter__ = AsyncMock(return_value=computer)
+    computer.__aexit__ = AsyncMock()
+
+    return computer
+
+
+@pytest.fixture
+def disable_telemetry(monkeypatch):
+    """Disable telemetry for tests.
+
+    Use this fixture to ensure no telemetry is sent during tests.
+    """
+    monkeypatch.setenv("CUA_TELEMETRY_DISABLED", "1")
+
+
+@pytest.fixture
+def sample_messages():
+    """Provide sample messages for testing.
+
+    Returns a list of messages in the expected format.
+    """
+    return [{"role": "user", "content": "Take a screenshot and tell me what you see"}]
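A sketch of how a test might consume these fixtures (hypothetical test, not part of the release); pytest injects the fixtures by name, and the asyncio marker assumes pytest-asyncio, which the new test module below also relies on:

import pytest


@pytest.mark.asyncio
async def test_mock_computer_screenshot(mock_computer, disable_telemetry):
    # No real Computer instance or telemetry is involved here.
    data = await mock_computer.interface.screenshot()
    assert data == b"fake_screenshot_data"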
cua_agent-0.4.37/tests/test_computer_agent.py

@@ -0,0 +1,139 @@
+"""Unit tests for ComputerAgent class.
+
+This file tests ONLY the ComputerAgent initialization and basic functionality.
+Following SRP: This file tests ONE class (ComputerAgent).
+All external dependencies (liteLLM, Computer) are mocked.
+"""
+
+from unittest.mock import AsyncMock, MagicMock, Mock, patch
+
+import pytest
+
+
+class TestComputerAgentInitialization:
+    """Test ComputerAgent initialization (SRP: Only tests initialization)."""
+
+    @patch("agent.agent.litellm")
+    def test_agent_initialization_with_model(self, mock_litellm, disable_telemetry):
+        """Test that agent can be initialized with a model string."""
+        from agent import ComputerAgent
+
+        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022")
+
+        assert agent is not None
+        assert hasattr(agent, "model")
+        assert agent.model == "anthropic/claude-3-5-sonnet-20241022"
+
+    @patch("agent.agent.litellm")
+    def test_agent_initialization_with_tools(self, mock_litellm, disable_telemetry, mock_computer):
+        """Test that agent can be initialized with tools."""
+        from agent import ComputerAgent
+
+        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022", tools=[mock_computer])
+
+        assert agent is not None
+        assert hasattr(agent, "tools")
+
+    @patch("agent.agent.litellm")
+    def test_agent_initialization_with_max_budget(self, mock_litellm, disable_telemetry):
+        """Test that agent can be initialized with max trajectory budget."""
+        from agent import ComputerAgent
+
+        budget = 5.0
+        agent = ComputerAgent(
+            model="anthropic/claude-3-5-sonnet-20241022", max_trajectory_budget=budget
+        )
+
+        assert agent is not None
+
+    @patch("agent.agent.litellm")
+    def test_agent_requires_model(self, mock_litellm, disable_telemetry):
+        """Test that agent requires a model parameter."""
+        from agent import ComputerAgent
+
+        with pytest.raises(TypeError):
+            # Should fail without model parameter - intentionally missing required argument
+            ComputerAgent()  # type: ignore[call-arg]
+
+
+class TestComputerAgentRun:
+    """Test ComputerAgent.run() method (SRP: Only tests run logic)."""
+
+    @pytest.mark.asyncio
+    @patch("agent.agent.litellm")
+    async def test_agent_run_with_messages(self, mock_litellm, disable_telemetry, sample_messages):
+        """Test that agent.run() works with valid messages."""
+        from agent import ComputerAgent
+
+        # Mock liteLLM response
+        mock_response = {
+            "id": "chatcmpl-test",
+            "choices": [
+                {
+                    "message": {"role": "assistant", "content": "Test response"},
+                    "finish_reason": "stop",
+                }
+            ],
+            "usage": {"prompt_tokens": 10, "completion_tokens": 20, "total_tokens": 30},
+        }
+
+        mock_litellm.acompletion = AsyncMock(return_value=mock_response)
+
+        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022")
+
+        # Run should return an async generator
+        result_generator = agent.run(sample_messages)
+
+        assert result_generator is not None
+        # Check it's an async generator
+        assert hasattr(result_generator, "__anext__")
+
+    def test_agent_has_run_method(self, disable_telemetry):
+        """Test that agent has run method available."""
+        from agent import ComputerAgent
+
+        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022")
+
+        # Verify run method exists
+        assert hasattr(agent, "run")
+        assert callable(agent.run)
+
+    def test_agent_has_agent_loop(self, disable_telemetry):
+        """Test that agent has agent_loop initialized."""
+        from agent import ComputerAgent
+
+        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022")
+
+        # Verify agent_loop is initialized
+        assert hasattr(agent, "agent_loop")
+        assert agent.agent_loop is not None
+
+
+class TestComputerAgentTypes:
+    """Test AgentResponse and Messages types (SRP: Only tests type definitions)."""
+
+    def test_messages_type_exists(self):
+        """Test that Messages type is exported."""
+        from agent import Messages
+
+        assert Messages is not None
+
+    def test_agent_response_type_exists(self):
+        """Test that AgentResponse type is exported."""
+        from agent import AgentResponse
+
+        assert AgentResponse is not None
+
+
+class TestComputerAgentIntegration:
+    """Test ComputerAgent integration with Computer tool (SRP: Integration within package)."""
+
+    def test_agent_accepts_computer_tool(self, disable_telemetry, mock_computer):
+        """Test that agent can be initialized with Computer tool."""
+        from agent import ComputerAgent
+
+        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022", tools=[mock_computer])
+
+        # Verify agent accepted the tool
+        assert agent is not None
+        assert hasattr(agent, "tools")
{cua_agent-0.4.35 → cua_agent-0.4.37}/agent/loops/__init__.py

@@ -15,8 +15,8 @@ from . import (
     omniparser,
     openai,
     opencua,
-    uitars,
     qwen,
+    uitars,
 )

 __all__ = [