PyPI - cua-agent - Version diff - 0.1.30.tar.gz → 0.1.31.tar.gz - Mend

cua-agent 0.1.30.tar.gz → 0.1.31.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cua-agent might be problematic. Click here for more details.

Files changed (84)

{cua_agent-0.1.30 → cua_agent-0.1.31}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cua-agent
-Version: 0.1.30
+Version: 0.1.31
 Summary: CUA (Computer Use) Agent for AI-driven computer interaction
 Author-Email: TryCua <gh@trycua.com>
 Requires-Python: >=3.10
@@ -101,6 +101,7 @@ pip install "cua-agent[all]"
 # or install specific loop providers
 pip install "cua-agent[openai]" # OpenAI Cua Loop
 pip install "cua-agent[anthropic]" # Anthropic Cua Loop
+pip install "cua-agent[uitars]"    # UI-Tars support
 pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models)
 pip install "cua-agent[ui]" # Gradio UI for the agent
 ```
@@ -148,7 +149,13 @@ Refer to these notebooks for step-by-step guides on how to use the Computer-Use
 ## Using the Gradio UI
-The agent includes a Gradio-based user interface for easy interaction. To use it:
+The agent includes a Gradio-based user interface for easier interaction.
+<div align="center">
+    <img src="../../img/agent_gradio_ui.png"/>
+</div>
+To use it:
 ```bash
 # Install with Gradio support

{cua_agent-0.1.30 → cua_agent-0.1.31}/README.md RENAMED Viewed

@@ -31,6 +31,7 @@ pip install "cua-agent[all]"
 # or install specific loop providers
 pip install "cua-agent[openai]" # OpenAI Cua Loop
 pip install "cua-agent[anthropic]" # Anthropic Cua Loop
+pip install "cua-agent[uitars]"    # UI-Tars support
 pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models)
 pip install "cua-agent[ui]" # Gradio UI for the agent
 ```
@@ -78,7 +79,13 @@ Refer to these notebooks for step-by-step guides on how to use the Computer-Use
 ## Using the Gradio UI
-The agent includes a Gradio-based user interface for easy interaction. To use it:
+The agent includes a Gradio-based user interface for easier interaction.
+<div align="center">
+    <img src="../../img/agent_gradio_ui.png"/>
+</div>
+To use it:
 ```bash
 # Install with Gradio support

{cua_agent-0.1.30 → cua_agent-0.1.31}/agent/providers/omni/clients/oaicompat.py RENAMED Viewed

@@ -93,7 +93,14 @@ class OAICompatClient(BaseOmniClient):
         """
         headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}
-        final_messages = [{"role": "system", "content": system}]
+        final_messages = [
+            {
+                "role": "system",
+                "content": [
+                    { "type": "text", "text": system }
+                ]
+            }
+        ]
         # Process messages
         for item in messages:
@@ -117,7 +124,10 @@ class OAICompatClient(BaseOmniClient):
                     else:
                         message = {
                             "role": item["role"],
-                            "content": [{"type": "text", "text": item["content"]}],
+                            "content": [{
+                                "type": "text",
+                                "text": item["content"]
+                            }],
                         }
                     final_messages.append(message)
             else:

{cua_agent-0.1.30 → cua_agent-0.1.31}/agent/providers/openai/tools/computer.py RENAMED Viewed

@@ -162,8 +162,8 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
                 y = kwargs.get("y")
                 if x is None or y is None:
                     raise ToolError("x and y coordinates are required for scroll action")
-                scroll_x = kwargs.get("scroll_x", 0) // 20
-                scroll_y = kwargs.get("scroll_y", 0) // 20
+                scroll_x = kwargs.get("scroll_x", 0) // 50
+                scroll_y = kwargs.get("scroll_y", 0) // 50
                 return await self.handle_scroll(x, y, scroll_x, scroll_y)
             elif type == "screenshot":
                 return await self.screenshot()
@@ -240,11 +240,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
             if len(mapped_keys) > 1:
                 # For key combinations (like Ctrl+C)
-                for k in mapped_keys:
-                    await self.computer.interface.press_key(k)
-                await asyncio.sleep(0.1)
-                for k in reversed(mapped_keys):
-                    await self.computer.interface.press_key(k)
+                await self.computer.interface.hotkey(*mapped_keys)
             else:
                 # Single key press
                 await self.computer.interface.press_key(mapped_keys[0])

{cua_agent-0.1.30 → cua_agent-0.1.31}/agent/providers/uitars/clients/oaicompat.py RENAMED Viewed

@@ -94,8 +94,15 @@ class OAICompatClient(BaseUITarsClient):
         """
         headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}
-        final_messages = [{"role": "system", "content": system}]
+        final_messages = [
+            {
+                "role": "system",
+                "content": [
+                    { "type": "text", "text": system }
+                ]
+            }
+        ]
         # Process messages
         for item in messages:
             if isinstance(item, dict):
@@ -138,8 +145,13 @@ class OAICompatClient(BaseUITarsClient):
                     message = {"role": "user", "content": [{"type": "text", "text": item}]}
                 final_messages.append(message)
-        payload = {"model": self.model, "messages": final_messages, "temperature": self.temperature}
-        payload["max_tokens"] = max_tokens or self.max_tokens
+        payload = {
+            "model": self.model,
+            "messages": final_messages,
+            "max_tokens": max_tokens or self.max_tokens,
+            "temperature": self.temperature,
+            "top_p": 0.7,
+        }
         try:
             async with aiohttp.ClientSession() as session:

{cua_agent-0.1.30 → cua_agent-0.1.31}/agent/providers/uitars/loop.py RENAMED Viewed

@@ -20,7 +20,7 @@ from computer import Computer
 from .utils import add_box_token, parse_actions, parse_action_parameters
 from .tools.manager import ToolManager
 from .tools.computer import ToolResult
-from .prompts import COMPUTER_USE, SYSTEM_PROMPT
+from .prompts import COMPUTER_USE, SYSTEM_PROMPT, MAC_SPECIFIC_NOTES
 from .clients.oaicompat import OAICompatClient
@@ -184,7 +184,7 @@ class UITARSLoop(BaseLoop):
         if first_user_idx is not None and instruction:
             # Create the computer use prompt
             user_prompt = COMPUTER_USE.format(
-                instruction=instruction,
+                instruction='\n'.join([instruction, MAC_SPECIFIC_NOTES]),
                 language="English"
             )
@@ -232,8 +232,11 @@ class UITARSLoop(BaseLoop):
                     if self.client is None:
                         raise RuntimeError("Failed to initialize client")
-                # Convert messages to UI-TARS format
+                # Get messages in standard format from the message manager
+                self.message_manager.messages = messages.copy()
                 prepared_messages = self.message_manager.get_messages()
+                # Convert messages to UI-TARS format
                 uitars_messages = self.to_uitars_format(prepared_messages)
                 # Log request

{cua_agent-0.1.30 → cua_agent-0.1.31}/agent/providers/uitars/prompts.py RENAMED Viewed

@@ -1,5 +1,9 @@
 """Prompts for UI-TARS agent."""
+MAC_SPECIFIC_NOTES = """
+(You are operating on macOS, use 'cmd' instead of 'ctrl' for most shortcuts e.g., hotkey(key='cmd c') for copy, hotkey(key='cmd v') for paste, hotkey(key='cmd t') for new tab).)
+"""
 SYSTEM_PROMPT = "You are a helpful assistant."
 COMPUTER_USE = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
@@ -56,4 +60,4 @@ finished(content='xxx') # Use escape characters \\', \\", and \\n in content par
 ## User Instruction
 {instruction}
-"""
+"""

{cua_agent-0.1.30 → cua_agent-0.1.31}/agent/providers/uitars/tools/computer.py RENAMED Viewed

@@ -173,9 +173,13 @@ class ComputerTool(BaseComputerTool):
             elif action == "hotkey":
                 if "keys" in kwargs:
                     keys = kwargs["keys"]
-                    for key in keys:
-                        await self.computer.interface.press_key(key)
+                    if len(keys) > 1:
+                        await self.computer.interface.hotkey(*keys)
+                    else:
+                        # Single key press
+                        await self.computer.interface.press_key(keys[0])
                     # Wait for UI to update
                     await asyncio.sleep(0.3)

{cua_agent-0.1.30 → cua_agent-0.1.31}/pyproject.toml RENAMED Viewed

@@ -6,7 +6,7 @@ build-backend = "pdm.backend"
 [project]
 name = "cua-agent"
-version = "0.1.30"
+version = "0.1.31"
 description = "CUA (Computer Use) Agent for AI-driven computer interaction"
 readme = "README.md"
 authors = [
@@ -108,7 +108,7 @@ target-version = [
 [tool.ruff]
 line-length = 100
-target-version = "0.1.30"
+target-version = "0.1.31"
 select = [
     "E",
     "F",
@@ -122,7 +122,7 @@ docstring-code-format = true
 [tool.mypy]
 strict = true
-python_version = "0.1.30"
+python_version = "0.1.31"
 ignore_missing_imports = true
 disallow_untyped_defs = true
 check_untyped_defs = true