cua-agent 0.2.10__tar.gz → 0.2.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (86) hide show
  1. {cua_agent-0.2.10 → cua_agent-0.2.12}/PKG-INFO +4 -5
  2. {cua_agent-0.2.10 → cua_agent-0.2.12}/README.md +1 -4
  3. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/__init__.py +1 -1
  4. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/agent.py +0 -2
  5. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/messages.py +40 -7
  6. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/telemetry.py +1 -1
  7. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/tools/computer.py +7 -167
  8. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/loop.py +0 -2
  9. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/openai/tools/computer.py +10 -40
  10. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/uitars/loop.py +0 -2
  11. cua_agent-0.2.12/agent/ui/__main__.py +15 -0
  12. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/ui/gradio/app.py +39 -33
  13. {cua_agent-0.2.10 → cua_agent-0.2.12}/pyproject.toml +7 -4
  14. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/__init__.py +0 -0
  15. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/base.py +0 -0
  16. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/callbacks.py +0 -0
  17. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/experiment.py +0 -0
  18. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/factory.py +0 -0
  19. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/provider_config.py +0 -0
  20. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/tools/__init__.py +0 -0
  21. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/tools/base.py +0 -0
  22. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/tools/bash.py +0 -0
  23. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/tools/collection.py +0 -0
  24. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/tools/computer.py +0 -0
  25. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/tools/edit.py +0 -0
  26. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/tools/manager.py +0 -0
  27. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/tools.py +0 -0
  28. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/types.py +0 -0
  29. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/core/visualization.py +0 -0
  30. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/__init__.py +0 -0
  31. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/__init__.py +0 -0
  32. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/api/client.py +0 -0
  33. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/api/logging.py +0 -0
  34. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/api_handler.py +0 -0
  35. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/callbacks/__init__.py +0 -0
  36. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/callbacks/manager.py +0 -0
  37. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/loop.py +0 -0
  38. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/prompts.py +0 -0
  39. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/response_handler.py +0 -0
  40. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/tools/__init__.py +0 -0
  41. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/tools/base.py +0 -0
  42. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/tools/bash.py +0 -0
  43. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/tools/collection.py +0 -0
  44. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/tools/edit.py +0 -0
  45. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/tools/manager.py +0 -0
  46. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/tools/run.py +0 -0
  47. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/types.py +0 -0
  48. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/anthropic/utils.py +0 -0
  49. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/__init__.py +0 -0
  50. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/api_handler.py +0 -0
  51. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/clients/anthropic.py +0 -0
  52. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/clients/base.py +0 -0
  53. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/clients/oaicompat.py +0 -0
  54. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/clients/ollama.py +0 -0
  55. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/clients/openai.py +0 -0
  56. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/clients/utils.py +0 -0
  57. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/image_utils.py +0 -0
  58. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/parser.py +0 -0
  59. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/prompts.py +0 -0
  60. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/tools/__init__.py +0 -0
  61. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/tools/base.py +0 -0
  62. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/tools/bash.py +0 -0
  63. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/tools/computer.py +0 -0
  64. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/tools/manager.py +0 -0
  65. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/omni/utils.py +0 -0
  66. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/openai/__init__.py +0 -0
  67. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/openai/api_handler.py +0 -0
  68. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/openai/loop.py +0 -0
  69. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/openai/response_handler.py +0 -0
  70. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/openai/tools/__init__.py +0 -0
  71. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/openai/tools/base.py +0 -0
  72. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/openai/tools/manager.py +0 -0
  73. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/openai/types.py +0 -0
  74. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/openai/utils.py +0 -0
  75. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/uitars/__init__.py +0 -0
  76. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/uitars/clients/base.py +0 -0
  77. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/uitars/clients/mlxvlm.py +0 -0
  78. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/uitars/clients/oaicompat.py +0 -0
  79. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/uitars/prompts.py +0 -0
  80. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/uitars/tools/__init__.py +0 -0
  81. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/uitars/tools/computer.py +0 -0
  82. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/uitars/tools/manager.py +0 -0
  83. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/providers/uitars/utils.py +0 -0
  84. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/telemetry.py +0 -0
  85. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/ui/__init__.py +0 -0
  86. {cua_agent-0.2.10 → cua_agent-0.2.12}/agent/ui/gradio/__init__.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cua-agent
3
- Version: 0.2.10
3
+ Version: 0.2.12
4
4
  Summary: CUA (Computer Use) Agent for AI-driven computer interaction
5
5
  Author-Email: TryCua <gh@trycua.com>
6
6
  Requires-Python: >=3.11
@@ -24,6 +24,7 @@ Requires-Dist: httpx<0.29.0,>=0.27.0; extra == "openai"
24
24
  Provides-Extra: uitars
25
25
  Requires-Dist: httpx<0.29.0,>=0.27.0; extra == "uitars"
26
26
  Provides-Extra: uitars-mlx
27
+ Requires-Dist: mlx-vlm>=0.1.27; sys_platform == "darwin" and extra == "uitars-mlx"
27
28
  Provides-Extra: ui
28
29
  Requires-Dist: gradio<6.0.0,>=5.23.3; extra == "ui"
29
30
  Requires-Dist: python-dotenv<2.0.0,>=1.0.1; extra == "ui"
@@ -67,6 +68,7 @@ Requires-Dist: requests<3.0.0,>=2.31.0; extra == "all"
67
68
  Requires-Dist: ollama<0.5.0,>=0.4.7; extra == "all"
68
69
  Requires-Dist: gradio<6.0.0,>=5.23.3; extra == "all"
69
70
  Requires-Dist: python-dotenv<2.0.0,>=1.0.1; extra == "all"
71
+ Requires-Dist: mlx-vlm>=0.1.27; sys_platform == "darwin" and extra == "all"
70
72
  Description-Content-Type: text/markdown
71
73
 
72
74
  <div align="center">
@@ -105,10 +107,7 @@ pip install "cua-agent[anthropic]" # Anthropic Cua Loop
105
107
  pip install "cua-agent[uitars]" # UI-Tars support
106
108
  pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models)
107
109
  pip install "cua-agent[ui]" # Gradio UI for the agent
108
-
109
- # For local UI-TARS with MLX support, you need to manually install mlx-vlm:
110
- pip install "cua-agent[uitars-mlx]"
111
- pip install git+https://github.com/ddupont808/mlx-vlm.git@stable/fix/qwen2-position-id # PR: https://github.com/Blaizzy/mlx-vlm/pull/349
110
+ pip install "cua-agent[uitars-mlx]" # MLX UI-Tars support
112
111
  ```
113
112
 
114
113
  ## Run
@@ -34,10 +34,7 @@ pip install "cua-agent[anthropic]" # Anthropic Cua Loop
34
34
  pip install "cua-agent[uitars]" # UI-Tars support
35
35
  pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models)
36
36
  pip install "cua-agent[ui]" # Gradio UI for the agent
37
-
38
- # For local UI-TARS with MLX support, you need to manually install mlx-vlm:
39
- pip install "cua-agent[uitars-mlx]"
40
- pip install git+https://github.com/ddupont808/mlx-vlm.git@stable/fix/qwen2-position-id # PR: https://github.com/Blaizzy/mlx-vlm/pull/349
37
+ pip install "cua-agent[uitars-mlx]" # MLX UI-Tars support
41
38
  ```
42
39
 
43
40
  ## Run
@@ -6,7 +6,7 @@ import logging
6
6
  __version__ = "0.1.0"
7
7
 
8
8
  # Initialize logging
9
- logger = logging.getLogger("cua.agent")
9
+ logger = logging.getLogger("agent")
10
10
 
11
11
  # Initialize telemetry when the package is imported
12
12
  try:
@@ -11,10 +11,8 @@ from .types import AgentResponse
11
11
  from .factory import LoopFactory
12
12
  from .provider_config import DEFAULT_MODELS, ENV_VARS
13
13
 
14
- logging.basicConfig(level=logging.INFO)
15
14
  logger = logging.getLogger(__name__)
16
15
 
17
-
18
16
  class ComputerAgent:
19
17
  """A computer agent that can perform automated tasks using natural language instructions."""
20
18
 
@@ -81,16 +81,27 @@ class StandardMessageManager:
81
81
  if not self.config.num_images_to_keep:
82
82
  return messages
83
83
 
84
- # Find user messages with images
84
+ # Find messages with images (both user messages and tool call outputs)
85
85
  image_messages = []
86
86
  for msg in messages:
87
+ has_image = False
88
+
89
+ # Check user messages with images
87
90
  if msg["role"] == "user" and isinstance(msg["content"], list):
88
91
  has_image = any(
89
92
  item.get("type") == "image_url" or item.get("type") == "image"
90
93
  for item in msg["content"]
91
94
  )
92
- if has_image:
93
- image_messages.append(msg)
95
+
96
+ # Check assistant messages with tool calls that have images
97
+ elif msg["role"] == "assistant" and isinstance(msg["content"], list):
98
+ for item in msg["content"]:
99
+ if item.get("type") == "tool_result" and "base64_image" in item:
100
+ has_image = True
101
+ break
102
+
103
+ if has_image:
104
+ image_messages.append(msg)
94
105
 
95
106
  # If we don't have more images than the limit, return all messages
96
107
  if len(image_messages) <= self.config.num_images_to_keep:
@@ -100,13 +111,35 @@ class StandardMessageManager:
100
111
  images_to_keep = image_messages[-self.config.num_images_to_keep :]
101
112
  images_to_remove = image_messages[: -self.config.num_images_to_keep]
102
113
 
103
- # Create a new message list without the older images
114
+ # Create a new message list, removing images from older messages
104
115
  result = []
105
116
  for msg in messages:
106
117
  if msg in images_to_remove:
107
- # Skip this message
108
- continue
109
- result.append(msg)
118
+ # Remove images from this message but keep the text content
119
+ if msg["role"] == "user" and isinstance(msg["content"], list):
120
+ # Keep only text content, remove images
121
+ new_content = [
122
+ item for item in msg["content"]
123
+ if item.get("type") not in ["image_url", "image"]
124
+ ]
125
+ if new_content: # Only add if there's still content
126
+ result.append({"role": msg["role"], "content": new_content})
127
+ elif msg["role"] == "assistant" and isinstance(msg["content"], list):
128
+ # Remove base64_image from tool_result items
129
+ new_content = []
130
+ for item in msg["content"]:
131
+ if item.get("type") == "tool_result" and "base64_image" in item:
132
+ # Create a copy without the base64_image
133
+ new_item = {k: v for k, v in item.items() if k != "base64_image"}
134
+ new_content.append(new_item)
135
+ else:
136
+ new_content.append(item)
137
+ result.append({"role": msg["role"], "content": new_content})
138
+ else:
139
+ # For other message types, keep as is
140
+ result.append(msg)
141
+ else:
142
+ result.append(msg)
110
143
 
111
144
  return result
112
145
 
@@ -34,7 +34,7 @@ flush = _default_flush
34
34
  is_telemetry_enabled = _default_is_telemetry_enabled
35
35
  is_telemetry_globally_disabled = _default_is_telemetry_globally_disabled
36
36
 
37
- logger = logging.getLogger("cua.agent.telemetry")
37
+ logger = logging.getLogger("agent.telemetry")
38
38
 
39
39
  try:
40
40
  # Import from core telemetry
@@ -205,26 +205,6 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
205
205
  self.logger.info(f" Coordinates: ({x}, {y})")
206
206
 
207
207
  try:
208
- # Take pre-action screenshot to get current dimensions
209
- pre_screenshot = await self.computer.interface.screenshot()
210
- pre_img = Image.open(io.BytesIO(pre_screenshot))
211
-
212
- # Scale image to match screen dimensions if needed
213
- if pre_img.size != (self.width, self.height):
214
- self.logger.info(
215
- f"Scaling image from {pre_img.size} to {self.width}x{self.height} to match screen dimensions"
216
- )
217
- if not isinstance(self.width, int) or not isinstance(self.height, int):
218
- raise ToolError("Screen dimensions must be integers")
219
- size = (int(self.width), int(self.height))
220
- pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
221
- # Save the scaled image back to bytes
222
- buffer = io.BytesIO()
223
- pre_img.save(buffer, format="PNG")
224
- pre_screenshot = buffer.getvalue()
225
-
226
- self.logger.info(f" Current dimensions: {pre_img.width}x{pre_img.height}")
227
-
228
208
  # Perform the click action
229
209
  if action == "left_click":
230
210
  self.logger.info(f"Clicking at ({x}, {y})")
@@ -242,45 +222,14 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
242
222
  # Wait briefly for any UI changes
243
223
  await asyncio.sleep(0.5)
244
224
 
245
- # Take and save post-action screenshot
246
- post_screenshot = await self.computer.interface.screenshot()
247
- post_img = Image.open(io.BytesIO(post_screenshot))
248
-
249
- # Scale post-action image if needed
250
- if post_img.size != (self.width, self.height):
251
- self.logger.info(
252
- f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
253
- )
254
- post_img = post_img.resize(
255
- (self.width, self.height), Image.Resampling.LANCZOS
256
- )
257
- buffer = io.BytesIO()
258
- post_img.save(buffer, format="PNG")
259
- post_screenshot = buffer.getvalue()
260
-
261
225
  return ToolResult(
262
226
  output=f"Performed {action} at ({x}, {y})",
263
- base64_image=base64.b64encode(post_screenshot).decode(),
264
227
  )
265
228
  except Exception as e:
266
229
  self.logger.error(f"Error during {action} action: {str(e)}")
267
230
  raise ToolError(f"Failed to perform {action}: {str(e)}")
268
231
  else:
269
232
  try:
270
- # Take pre-action screenshot
271
- pre_screenshot = await self.computer.interface.screenshot()
272
- pre_img = Image.open(io.BytesIO(pre_screenshot))
273
-
274
- # Scale image if needed
275
- if pre_img.size != (self.width, self.height):
276
- self.logger.info(
277
- f"Scaling image from {pre_img.size} to {self.width}x{self.height}"
278
- )
279
- if not isinstance(self.width, int) or not isinstance(self.height, int):
280
- raise ToolError("Screen dimensions must be integers")
281
- size = (int(self.width), int(self.height))
282
- pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
283
-
284
233
  # Perform the click action
285
234
  if action == "left_click":
286
235
  self.logger.info("Performing left click at current position")
@@ -295,25 +244,8 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
295
244
  # Wait briefly for any UI changes
296
245
  await asyncio.sleep(0.5)
297
246
 
298
- # Take post-action screenshot
299
- post_screenshot = await self.computer.interface.screenshot()
300
- post_img = Image.open(io.BytesIO(post_screenshot))
301
-
302
- # Scale post-action image if needed
303
- if post_img.size != (self.width, self.height):
304
- self.logger.info(
305
- f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
306
- )
307
- post_img = post_img.resize(
308
- (self.width, self.height), Image.Resampling.LANCZOS
309
- )
310
- buffer = io.BytesIO()
311
- post_img.save(buffer, format="PNG")
312
- post_screenshot = buffer.getvalue()
313
-
314
247
  return ToolResult(
315
248
  output=f"Performed {action} at current position",
316
- base64_image=base64.b64encode(post_screenshot).decode(),
317
249
  )
318
250
  except Exception as e:
319
251
  self.logger.error(f"Error during {action} action: {str(e)}")
@@ -328,20 +260,6 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
328
260
  raise ToolError(f"{text} must be a string")
329
261
 
330
262
  try:
331
- # Take pre-action screenshot
332
- pre_screenshot = await self.computer.interface.screenshot()
333
- pre_img = Image.open(io.BytesIO(pre_screenshot))
334
-
335
- # Scale image if needed
336
- if pre_img.size != (self.width, self.height):
337
- self.logger.info(
338
- f"Scaling image from {pre_img.size} to {self.width}x{self.height}"
339
- )
340
- if not isinstance(self.width, int) or not isinstance(self.height, int):
341
- raise ToolError("Screen dimensions must be integers")
342
- size = (int(self.width), int(self.height))
343
- pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
344
-
345
263
  if action == "key":
346
264
  # Special handling for page up/down on macOS
347
265
  if text.lower() in ["pagedown", "page_down", "page down"]:
@@ -378,25 +296,8 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
378
296
  # Wait briefly for UI changes
379
297
  await asyncio.sleep(0.5)
380
298
 
381
- # Take post-action screenshot
382
- post_screenshot = await self.computer.interface.screenshot()
383
- post_img = Image.open(io.BytesIO(post_screenshot))
384
-
385
- # Scale post-action image if needed
386
- if post_img.size != (self.width, self.height):
387
- self.logger.info(
388
- f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
389
- )
390
- post_img = post_img.resize(
391
- (self.width, self.height), Image.Resampling.LANCZOS
392
- )
393
- buffer = io.BytesIO()
394
- post_img.save(buffer, format="PNG")
395
- post_screenshot = buffer.getvalue()
396
-
397
299
  return ToolResult(
398
300
  output=f"Pressed key: {output_text}",
399
- base64_image=base64.b64encode(post_screenshot).decode(),
400
301
  )
401
302
 
402
303
  elif action == "type":
@@ -406,66 +307,13 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
406
307
  # Wait briefly for UI changes
407
308
  await asyncio.sleep(0.5)
408
309
 
409
- # Take post-action screenshot
410
- post_screenshot = await self.computer.interface.screenshot()
411
- post_img = Image.open(io.BytesIO(post_screenshot))
412
-
413
- # Scale post-action image if needed
414
- if post_img.size != (self.width, self.height):
415
- self.logger.info(
416
- f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
417
- )
418
- post_img = post_img.resize(
419
- (self.width, self.height), Image.Resampling.LANCZOS
420
- )
421
- buffer = io.BytesIO()
422
- post_img.save(buffer, format="PNG")
423
- post_screenshot = buffer.getvalue()
424
-
425
310
  return ToolResult(
426
311
  output=f"Typed text: {text}",
427
- base64_image=base64.b64encode(post_screenshot).decode(),
428
312
  )
429
313
  except Exception as e:
430
314
  self.logger.error(f"Error during {action} action: {str(e)}")
431
315
  raise ToolError(f"Failed to perform {action}: {str(e)}")
432
316
 
433
- elif action in ("screenshot", "cursor_position"):
434
- if text is not None:
435
- raise ToolError(f"text is not accepted for {action}")
436
- if coordinate is not None:
437
- raise ToolError(f"coordinate is not accepted for {action}")
438
-
439
- try:
440
- if action == "screenshot":
441
- # Take screenshot
442
- screenshot = await self.computer.interface.screenshot()
443
- img = Image.open(io.BytesIO(screenshot))
444
-
445
- # Scale image if needed
446
- if img.size != (self.width, self.height):
447
- self.logger.info(
448
- f"Scaling image from {img.size} to {self.width}x{self.height}"
449
- )
450
- if not isinstance(self.width, int) or not isinstance(self.height, int):
451
- raise ToolError("Screen dimensions must be integers")
452
- size = (int(self.width), int(self.height))
453
- img = img.resize(size, Image.Resampling.LANCZOS)
454
- buffer = io.BytesIO()
455
- img.save(buffer, format="PNG")
456
- screenshot = buffer.getvalue()
457
-
458
- return ToolResult(base64_image=base64.b64encode(screenshot).decode())
459
-
460
- elif action == "cursor_position":
461
- pos = await self.computer.interface.get_cursor_position()
462
- x, y = pos # Unpack the tuple
463
- return ToolResult(output=f"X={int(x)},Y={int(y)}")
464
-
465
- except Exception as e:
466
- self.logger.error(f"Error during {action} action: {str(e)}")
467
- raise ToolError(f"Failed to perform {action}: {str(e)}")
468
-
469
317
  elif action == "scroll":
470
318
  # Implement scroll action
471
319
  direction = kwargs.get("direction", "down")
@@ -487,28 +335,20 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
487
335
  # Wait briefly for UI changes
488
336
  await asyncio.sleep(0.5)
489
337
 
490
- # Take post-action screenshot
491
- post_screenshot = await self.computer.interface.screenshot()
492
- post_img = Image.open(io.BytesIO(post_screenshot))
493
-
494
- # Scale post-action image if needed
495
- if post_img.size != (self.width, self.height):
496
- self.logger.info(
497
- f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
498
- )
499
- post_img = post_img.resize((self.width, self.height), Image.Resampling.LANCZOS)
500
- buffer = io.BytesIO()
501
- post_img.save(buffer, format="PNG")
502
- post_screenshot = buffer.getvalue()
503
-
504
338
  return ToolResult(
505
339
  output=f"Scrolled {direction} by {amount} steps",
506
- base64_image=base64.b64encode(post_screenshot).decode(),
507
340
  )
508
341
  except Exception as e:
509
342
  self.logger.error(f"Error during scroll action: {str(e)}")
510
343
  raise ToolError(f"Failed to perform scroll: {str(e)}")
511
344
 
345
+ elif action == "screenshot":
346
+ # Take screenshot
347
+ return await self.screenshot()
348
+ elif action == "cursor_position":
349
+ pos = await self.computer.interface.get_cursor_position()
350
+ x, y = pos # Unpack the tuple
351
+ return ToolResult(output=f"X={int(x)},Y={int(y)}")
512
352
  raise ToolError(f"Invalid action: {action}")
513
353
 
514
354
  async def screenshot(self):
@@ -26,10 +26,8 @@ from .api_handler import OmniAPIHandler
26
26
  from .tools.manager import ToolManager
27
27
  from .tools import ToolResult
28
28
 
29
- logging.basicConfig(level=logging.INFO)
30
29
  logger = logging.getLogger(__name__)
31
30
 
32
-
33
31
  def extract_data(input_string: str, data_type: str) -> str:
34
32
  """Extract content from code blocks."""
35
33
  pattern = f"```{data_type}" + r"(.*?)(```|$)"
@@ -61,9 +61,6 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
61
61
  computer: Computer # The CUA Computer instance
62
62
  logger = logging.getLogger(__name__)
63
63
 
64
- _screenshot_delay = 1.0 # macOS is generally faster than X11
65
- _scaling_enabled = True
66
-
67
64
  def __init__(self, computer: Computer):
68
65
  """Initialize the computer tool.
69
66
 
@@ -185,26 +182,23 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
185
182
  raise ToolError(f"Failed to execute {type}: {str(e)}")
186
183
 
187
184
  async def handle_click(self, button: str, x: int, y: int) -> ToolResult:
188
- """Handle different click actions."""
185
+ """Handle mouse clicks."""
189
186
  try:
190
- # Perform requested click action
187
+ # Perform the click based on button type
191
188
  if button == "left":
192
189
  await self.computer.interface.left_click(x, y)
193
190
  elif button == "right":
194
191
  await self.computer.interface.right_click(x, y)
195
192
  elif button == "double":
196
193
  await self.computer.interface.double_click(x, y)
194
+ else:
195
+ raise ToolError(f"Unsupported button type: {button}")
197
196
 
198
- # Wait for UI to update
199
- await asyncio.sleep(0.5)
200
-
201
- # Take screenshot after action
202
- screenshot = await self.computer.interface.screenshot()
203
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
197
+ # Wait briefly for UI to update
198
+ await asyncio.sleep(0.3)
204
199
 
205
200
  return ToolResult(
206
201
  output=f"Performed {button} click at ({x}, {y})",
207
- base64_image=base64_screenshot,
208
202
  )
209
203
  except Exception as e:
210
204
  self.logger.error(f"Error in handle_click: {str(e)}")
@@ -218,11 +212,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
218
212
 
219
213
  await asyncio.sleep(0.3)
220
214
 
221
- # Take screenshot after typing
222
- screenshot = await self.computer.interface.screenshot()
223
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
224
-
225
- return ToolResult(output=f"Typed: {text}", base64_image=base64_screenshot)
215
+ return ToolResult(output=f"Typed: {text}")
226
216
  except Exception as e:
227
217
  self.logger.error(f"Error in handle_typing: {str(e)}")
228
218
  raise ToolError(f"Failed to type '{text}': {str(e)}")
@@ -254,11 +244,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
254
244
  # Wait briefly
255
245
  await asyncio.sleep(0.3)
256
246
 
257
- # Take screenshot after action
258
- screenshot = await self.computer.interface.screenshot()
259
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
260
-
261
- return ToolResult(output=f"Pressed key: {key}", base64_image=base64_screenshot)
247
+ return ToolResult(output=f"Pressed key: {key}")
262
248
  except Exception as e:
263
249
  self.logger.error(f"Error in handle_key: {str(e)}")
264
250
  raise ToolError(f"Failed to press key '{key}': {str(e)}")
@@ -272,11 +258,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
272
258
  # Wait briefly
273
259
  await asyncio.sleep(0.2)
274
260
 
275
- # Take screenshot after action
276
- screenshot = await self.computer.interface.screenshot()
277
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
278
-
279
- return ToolResult(output=f"Moved cursor to ({x}, {y})", base64_image=base64_screenshot)
261
+ return ToolResult(output=f"Moved cursor to ({x}, {y})")
280
262
  except Exception as e:
281
263
  self.logger.error(f"Error in handle_mouse_move: {str(e)}")
282
264
  raise ToolError(f"Failed to move cursor to ({x}, {y}): {str(e)}")
@@ -296,14 +278,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
296
278
  # Wait for UI to update
297
279
  await asyncio.sleep(0.5)
298
280
 
299
- # Take screenshot after action
300
- screenshot = await self.computer.interface.screenshot()
301
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
302
-
303
- return ToolResult(
304
- output=f"Scrolled at ({x}, {y}) with delta ({scroll_x}, {scroll_y})",
305
- base64_image=base64_screenshot,
306
- )
281
+ return ToolResult(output=f"Scrolled at ({x}, {y}) by ({scroll_x}, {scroll_y})")
307
282
  except Exception as e:
308
283
  self.logger.error(f"Error in handle_scroll: {str(e)}")
309
284
  raise ToolError(f"Failed to scroll at ({x}, {y}): {str(e)}")
@@ -331,13 +306,8 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
331
306
  # Wait for UI to update
332
307
  await asyncio.sleep(0.5)
333
308
 
334
- # Take screenshot after action
335
- screenshot = await self.computer.interface.screenshot()
336
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
337
-
338
309
  return ToolResult(
339
310
  output=f"Dragged from ({path[0]['x']}, {path[0]['y']}) to ({path[-1]['x']}, {path[-1]['y']})",
340
- base64_image=base64_screenshot,
341
311
  )
342
312
  except Exception as e:
343
313
  self.logger.error(f"Error in handle_drag: {str(e)}")
@@ -25,10 +25,8 @@ from .prompts import COMPUTER_USE, SYSTEM_PROMPT, MAC_SPECIFIC_NOTES
25
25
  from .clients.oaicompat import OAICompatClient
26
26
  from .clients.mlxvlm import MLXVLMUITarsClient
27
27
 
28
- logging.basicConfig(level=logging.INFO)
29
28
  logger = logging.getLogger(__name__)
30
29
 
31
-
32
30
  class UITARSLoop(BaseLoop):
33
31
  """UI-TARS-specific implementation of the agent loop.
34
32
 
@@ -0,0 +1,15 @@
1
+ """
2
+ Main entry point for agent.ui module.
3
+
4
+ This allows running the agent UI with:
5
+ python -m agent.ui
6
+
7
+ Instead of:
8
+ python -m agent.ui.gradio.app
9
+ """
10
+
11
+ from .gradio.app import create_gradio_ui
12
+
13
+ if __name__ == "__main__":
14
+ app = create_gradio_ui()
15
+ app.launch(share=False, inbrowser=True)
@@ -132,11 +132,19 @@ class GradioChatScreenshotHandler(DefaultCallbackHandler):
132
132
  # Detect if current device is MacOS
133
133
  is_mac = platform.system().lower() == "darwin"
134
134
 
135
+ # Detect if lume is available (host device is macOS)
136
+ is_lume_available = is_mac or (os.environ.get("PYLUME_HOST", "localhost") != "localhost")
137
+
138
+ print("PYLUME_HOST: ", os.environ.get("PYLUME_HOST", "localhost"))
139
+ print("is_mac: ", is_mac)
140
+ print("Lume available: ", is_lume_available)
141
+
135
142
  # Map model names to specific provider model names
136
143
  MODEL_MAPPINGS = {
137
144
  "openai": {
138
145
  # Default to operator CUA model
139
146
  "default": "computer-use-preview",
147
+ "OpenAI: Computer-Use Preview": "computer-use-preview",
140
148
  # Map standard OpenAI model names to CUA-specific model names
141
149
  "gpt-4-turbo": "computer-use-preview",
142
150
  "gpt-4o": "computer-use-preview",
@@ -147,9 +155,17 @@ MODEL_MAPPINGS = {
147
155
  "anthropic": {
148
156
  # Default to newest model
149
157
  "default": "claude-3-7-sonnet-20250219",
158
+ # New Claude 4 models
159
+ "Anthropic: Claude 4 Opus (20250514)": "claude-opus-4-20250514",
160
+ "Anthropic: Claude 4 Sonnet (20250514)": "claude-sonnet-4-20250514",
161
+ "claude-opus-4-20250514": "claude-opus-4-20250514",
162
+ "claude-sonnet-4-20250514": "claude-sonnet-4-20250514",
163
+
150
164
  # Specific Claude models for CUA
151
- "claude-3-5-sonnet-20240620": "claude-3-5-sonnet-20240620",
165
+ "Anthropic: Claude 3.7 Sonnet (20250219)": "claude-3-7-sonnet-20250219",
166
+ "Anthropic: Claude 3.5 Sonnet (20240620)": "claude-3-5-sonnet-20240620",
152
167
  "claude-3-7-sonnet-20250219": "claude-3-7-sonnet-20250219",
168
+ "claude-3-5-sonnet-20240620": "claude-3-5-sonnet-20240620",
153
169
  # Map standard model names to CUA-specific model names
154
170
  "claude-3-opus": "claude-3-7-sonnet-20250219",
155
171
  "claude-3-sonnet": "claude-3-5-sonnet-20240620",
@@ -209,12 +225,12 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
209
225
  if agent_loop == AgentLoop.OPENAI:
210
226
  provider = LLMProvider.OPENAI
211
227
  model_name_to_use = MODEL_MAPPINGS["openai"].get(
212
- model_name.lower(), MODEL_MAPPINGS["openai"]["default"]
228
+ model_name, MODEL_MAPPINGS["openai"]["default"]
213
229
  )
214
230
  elif agent_loop == AgentLoop.ANTHROPIC:
215
231
  provider = LLMProvider.ANTHROPIC
216
232
  model_name_to_use = MODEL_MAPPINGS["anthropic"].get(
217
- model_name.lower(), MODEL_MAPPINGS["anthropic"]["default"]
233
+ model_name, MODEL_MAPPINGS["anthropic"]["default"]
218
234
  )
219
235
  elif agent_loop == AgentLoop.OMNI:
220
236
  # Determine provider and clean model name based on the full string from UI
@@ -234,33 +250,11 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
234
250
  cleaned_model_name = model_name.split("OMNI: Ollama ", 1)[1]
235
251
  elif model_name.startswith("OMNI: Claude "):
236
252
  provider = LLMProvider.ANTHROPIC
237
- # Extract the canonical model name based on the UI string
238
- # e.g., "OMNI: Claude 3.7 Sonnet (20250219)" -> "3.7 Sonnet" and "20250219"
239
- parts = model_name.split(" (")
240
- model_key_part = parts[0].replace("OMNI: Claude ", "")
241
- date_part = parts[1].replace(")", "") if len(parts) > 1 else ""
242
-
243
- # Normalize the extracted key part for comparison
244
- # "3.7 Sonnet" -> "37sonnet"
245
- model_key_part_norm = model_key_part.lower().replace(".", "").replace(" ", "")
246
253
 
247
- cleaned_model_name = MODEL_MAPPINGS["omni"]["default"] # Default if not found
248
- # Find the canonical name in the main Anthropic map
249
- for key_anthropic, val_anthropic in MODEL_MAPPINGS["anthropic"].items():
250
- # Normalize the canonical key for comparison
251
- # "claude-3-7-sonnet-20250219" -> "claude37sonnet20250219"
252
- key_anthropic_norm = key_anthropic.lower().replace("-", "")
253
-
254
- # Check if the normalized canonical key starts with "claude" + normalized extracted part
255
- # AND contains the date part.
256
- if (
257
- key_anthropic_norm.startswith("claude" + model_key_part_norm)
258
- and date_part in key_anthropic_norm
259
- ):
260
- cleaned_model_name = (
261
- val_anthropic # Use the canonical name like "claude-3-7-sonnet-20250219"
262
- )
263
- break
254
+ model_name = model_name.replace("OMNI: ", "Anthropic: ")
255
+ cleaned_model_name = MODEL_MAPPINGS["anthropic"].get(
256
+ model_name, MODEL_MAPPINGS["anthropic"]["default"]
257
+ )
264
258
  elif model_name.startswith("OMNI: OpenAI "):
265
259
  provider = LLMProvider.OPENAI
266
260
  # Extract the model part, e.g., "GPT-4o mini"
@@ -309,6 +303,8 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple:
309
303
  model_name_to_use = MODEL_MAPPINGS["openai"]["default"]
310
304
  agent_loop = AgentLoop.OPENAI
311
305
 
306
+ print(f"Mapping {model_name} and {loop_provider} to {provider}, {model_name_to_use}, {agent_loop}")
307
+
312
308
  return provider, model_name_to_use, agent_loop
313
309
 
314
310
 
@@ -453,6 +449,9 @@ def create_gradio_ui(
453
449
  # Always show models regardless of API key availability
454
450
  openai_models = ["OpenAI: Computer-Use Preview"]
455
451
  anthropic_models = [
452
+ "Anthropic: Claude 4 Opus (20250514)",
453
+ "Anthropic: Claude 4 Sonnet (20250514)",
454
+
456
455
  "Anthropic: Claude 3.7 Sonnet (20250219)",
457
456
  "Anthropic: Claude 3.5 Sonnet (20240620)",
458
457
  ]
@@ -460,6 +459,8 @@ def create_gradio_ui(
460
459
  "OMNI: OpenAI GPT-4o",
461
460
  "OMNI: OpenAI GPT-4o mini",
462
461
  "OMNI: OpenAI GPT-4.5-preview",
462
+ "OMNI: Claude 4 Opus (20250514)",
463
+ "OMNI: Claude 4 Sonnet (20250514)",
463
464
  "OMNI: Claude 3.7 Sonnet (20250219)",
464
465
  "OMNI: Claude 3.5 Sonnet (20240620)"
465
466
  ]
@@ -729,20 +730,25 @@ if __name__ == "__main__":
729
730
  with gr.Accordion("Computer Configuration", open=True):
730
731
  # Computer configuration options
731
732
  computer_os = gr.Radio(
732
- choices=["macos", "linux"],
733
+ choices=["macos", "linux", "windows"],
733
734
  label="Operating System",
734
735
  value="macos",
735
736
  info="Select the operating system for the computer",
736
737
  )
737
738
 
738
- # Detect if current device is MacOS
739
+ is_windows = platform.system().lower() == "windows"
739
740
  is_mac = platform.system().lower() == "darwin"
740
741
 
742
+ providers = ["cloud"]
743
+ if is_lume_available:
744
+ providers += ["lume"]
745
+ if is_windows:
746
+ providers += ["winsandbox"]
747
+
741
748
  computer_provider = gr.Radio(
742
- choices=["cloud", "lume"],
749
+ choices=providers,
743
750
  label="Provider",
744
751
  value="lume" if is_mac else "cloud",
745
- visible=is_mac,
746
752
  info="Select the computer provider",
747
753
  )
748
754
 
@@ -6,7 +6,7 @@ build-backend = "pdm.backend"
6
6
 
7
7
  [project]
8
8
  name = "cua-agent"
9
- version = "0.2.10"
9
+ version = "0.2.12"
10
10
  description = "CUA (Computer Use) Agent for AI-driven computer interaction"
11
11
  readme = "README.md"
12
12
  authors = [
@@ -39,7 +39,9 @@ openai = [
39
39
  uitars = [
40
40
  "httpx>=0.27.0,<0.29.0",
41
41
  ]
42
- uitars-mlx = []
42
+ uitars-mlx = [
43
+ "mlx-vlm>=0.1.27; sys_platform == 'darwin'",
44
+ ]
43
45
  ui = [
44
46
  "gradio>=5.23.3,<6.0.0",
45
47
  "python-dotenv>=1.0.1,<2.0.0",
@@ -86,6 +88,7 @@ all = [
86
88
  "ollama>=0.4.7,<0.5.0",
87
89
  "gradio>=5.23.3,<6.0.0",
88
90
  "python-dotenv>=1.0.1,<2.0.0",
91
+ "mlx-vlm>=0.1.27; sys_platform == 'darwin'",
89
92
  ]
90
93
 
91
94
  [tool.pdm]
@@ -109,7 +112,7 @@ target-version = [
109
112
 
110
113
  [tool.ruff]
111
114
  line-length = 100
112
- target-version = "0.2.10"
115
+ target-version = "py311"
113
116
  select = [
114
117
  "E",
115
118
  "F",
@@ -123,7 +126,7 @@ docstring-code-format = true
123
126
 
124
127
  [tool.mypy]
125
128
  strict = true
126
- python_version = "0.2.10"
129
+ python_version = "3.11"
127
130
  ignore_missing_imports = true
128
131
  disallow_untyped_defs = true
129
132
  check_untyped_defs = true